From d32492ce8ad600b1693115761cc6b200b8f8381c Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 29 Aug 2022 18:26:56 +0200 Subject: [PATCH 01/87] Fix GROUPING function SQL compatibility --- src/Functions/grouping.h | 6 +-- .../02315_grouping_constant_folding.reference | 52 +++++++++---------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/src/Functions/grouping.h b/src/Functions/grouping.h index a49e946b2cb..dc630123dcb 100644 --- a/src/Functions/grouping.h +++ b/src/Functions/grouping.h @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -54,7 +53,7 @@ public: UInt64 value = 0; for (auto index : arguments_indexes) - value = (value << 1) + (checker(set_index, index) ? 1 : 0); + value = (value << 1) + (checker(set_index, index) ? 0 : 1); result_data.push_back(value); } @@ -73,8 +72,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { - UInt64 value = (ONE << arguments_indexes.size()) - 1; - return ColumnUInt64::create(input_rows_count, value); + return ColumnUInt64::create(input_rows_count, 0); } }; diff --git a/tests/queries/0_stateless/02315_grouping_constant_folding.reference b/tests/queries/0_stateless/02315_grouping_constant_folding.reference index 5aa979b1453..c44fee183da 100644 --- a/tests/queries/0_stateless/02315_grouping_constant_folding.reference +++ b/tests/queries/0_stateless/02315_grouping_constant_folding.reference @@ -1,29 +1,29 @@ -- { echoOn } SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); -1 0 0 3 -1 0 2 3 -1 0 4 3 -1 0 6 3 -1 0 8 3 -1 1 1 3 -1 1 3 3 -1 1 5 3 -1 1 7 3 -1 1 9 3 -5 0 0 2 -5 1 0 2 -10 0 0 0 +1 0 0 0 +1 0 2 0 +1 0 4 0 +1 0 6 0 +1 0 8 0 +1 1 1 0 +1 1 3 0 +1 1 5 0 +1 1 7 0 +1 1 9 0 +5 0 0 1 +5 1 0 1 +10 0 0 3 SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); -1 0 0 3 -1 0 2 3 -1 0 4 3 -1 0 6 3 -1 0 8 3 -1 1 1 3 -1 1 3 3 -1 1 5 3 -1 1 7 3 -1 1 9 3 -5 0 0 2 -5 1 0 2 -10 0 0 0 +1 0 0 0 +1 0 2 0 +1 0 4 0 +1 0 6 0 +1 0 8 0 +1 1 1 0 +1 1 3 0 +1 1 5 0 +1 1 7 0 +1 1 9 0 +5 0 0 1 +5 1 0 1 +10 0 0 3 From 003483b6165cbee284f91c967fe5c5547a40c733 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 30 Aug 2022 20:49:40 +0200 Subject: [PATCH 02/87] Add compatibility setting --- src/Core/Settings.h | 2 + src/Core/SettingsChangesHistory.h | 1 + src/Functions/grouping.h | 33 +++++++---- src/Interpreters/ActionsVisitor.cpp | 8 +-- .../02315_grouping_constant_folding.reference | 56 +++++++++---------- .../02315_grouping_constant_folding.sql | 4 +- 6 files changed, 59 insertions(+), 45 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index af32c15a867..b41284ef20a 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -612,6 +612,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, allow_deprecated_database_ordinary, false, "Allow to create databases with deprecated Ordinary engine", 0) \ M(Bool, allow_deprecated_syntax_for_merge_tree, false, "Allow to create *MergeTree tables with deprecated engine definition syntax", 0) \ \ + M(Bool, force_grouping_standard_compatibility, true, "Make GROUPING function to return 1 when argument is not used as an aggregation key", 0) \ + \ M(Bool, schema_inference_use_cache_for_file, true, "Use cache in schema inference while using file table function", 0) \ M(Bool, schema_inference_use_cache_for_s3, true, "Use cache in schema 
inference while using s3 table function", 0) \ M(Bool, schema_inference_use_cache_for_hdfs, true, "Use cache in schema inference while using hdfs table function", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 8d0e69f4b29..9440cd8bae2 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -78,6 +78,7 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}}, {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"}, {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"}, {"format_csv_allow_single_quotes", true, false, "Most tools don't treat single quote in CSV specially, don't do it by default too"}}}, diff --git a/src/Functions/grouping.h b/src/Functions/grouping.h index dc630123dcb..7a9df462b23 100644 --- a/src/Functions/grouping.h +++ b/src/Functions/grouping.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -18,10 +19,15 @@ protected: static constexpr UInt64 ONE = 1; const ColumnNumbers arguments_indexes; + const bool force_compatibility; + + static constexpr UInt64 COMPATIBLE_MODE[] = {1, 0}; + static constexpr UInt64 INCOMPATIBLE_MODE[] = {0, 1}; public: - FunctionGroupingBase(ColumnNumbers arguments_indexes_) + FunctionGroupingBase(ColumnNumbers arguments_indexes_, bool force_compatibility_) : arguments_indexes(std::move(arguments_indexes_)) + , force_compatibility(force_compatibility_) {} bool isVariadic() const override { return true; } @@ -47,13 +53,15 @@ public: auto result = ColumnUInt64::create(); auto & result_data = result->getData(); result_data.reserve(input_rows_count); + + const auto * result_table = likely(force_compatibility) ? COMPATIBLE_MODE : INCOMPATIBLE_MODE; for (size_t i = 0; i < input_rows_count; ++i) { UInt64 set_index = grouping_set_column->getElement(i); UInt64 value = 0; for (auto index : arguments_indexes) - value = (value << 1) + (checker(set_index, index) ? 0 : 1); + value = (value << 1) + result_table[checker(set_index, index) ? 
1 : 0]; result_data.push_back(value); } @@ -64,15 +72,18 @@ public: class FunctionGroupingOrdinary : public FunctionGroupingBase { public: - explicit FunctionGroupingOrdinary(ColumnNumbers arguments_indexes_) - : FunctionGroupingBase(std::move(arguments_indexes_)) + FunctionGroupingOrdinary(ColumnNumbers arguments_indexes_, bool force_compatibility_) + : FunctionGroupingBase(std::move(arguments_indexes_), force_compatibility_) {} String getName() const override { return "groupingOrdinary"; } ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { - return ColumnUInt64::create(input_rows_count, 0); + if (likely(force_compatibility)) + return ColumnUInt64::create(input_rows_count, 0); + UInt64 value = (ONE << arguments_indexes.size()) - 1; + return ColumnUInt64::create(input_rows_count, value); } }; @@ -81,8 +92,8 @@ class FunctionGroupingForRollup : public FunctionGroupingBase const UInt64 aggregation_keys_number; public: - FunctionGroupingForRollup(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_) - : FunctionGroupingBase(std::move(arguments_indexes_)) + FunctionGroupingForRollup(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_, bool force_compatibility_) + : FunctionGroupingBase(std::move(arguments_indexes_), force_compatibility_) , aggregation_keys_number(aggregation_keys_number_) {} @@ -111,8 +122,8 @@ class FunctionGroupingForCube : public FunctionGroupingBase public: - FunctionGroupingForCube(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_) - : FunctionGroupingBase(arguments_indexes_) + FunctionGroupingForCube(ColumnNumbers arguments_indexes_, UInt64 aggregation_keys_number_, bool force_compatibility_) + : FunctionGroupingBase(arguments_indexes_, force_compatibility_) , aggregation_keys_number(aggregation_keys_number_) {} @@ -140,8 +151,8 @@ class FunctionGroupingForGroupingSets : public FunctionGroupingBase { ColumnNumbersSetList grouping_sets; public: - FunctionGroupingForGroupingSets(ColumnNumbers arguments_indexes_, ColumnNumbersList const & grouping_sets_) - : FunctionGroupingBase(std::move(arguments_indexes_)) + FunctionGroupingForGroupingSets(ColumnNumbers arguments_indexes_, ColumnNumbersList const & grouping_sets_, bool force_compatibility_) + : FunctionGroupingBase(std::move(arguments_indexes_), force_compatibility_) { for (auto const & set : grouping_sets_) grouping_sets.emplace_back(set.begin(), set.end()); diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 6c9e54a966d..0412ce14b54 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -880,20 +880,20 @@ void ActionsMatcher::visit(const ASTFunction & node, const ASTPtr & ast, Data & { case GroupByKind::GROUPING_SETS: { - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), keys_info.grouping_set_keys)), { "__grouping_set" }, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), keys_info.grouping_set_keys, data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), { "__grouping_set" }, column_name); break; } case GroupByKind::ROLLUP: - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number)), { "__grouping_set" }, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number, 
data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), { "__grouping_set" }, column_name); break; case GroupByKind::CUBE: { - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number)), { "__grouping_set" }, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), aggregation_keys_number, data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), { "__grouping_set" }, column_name); break; } case GroupByKind::ORDINARY: { - data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes))), {}, column_name); + data.addFunction(std::make_shared(std::make_shared(std::move(arguments_indexes), data.getContext()->getSettingsRef().force_grouping_standard_compatibility)), {}, column_name); break; } default: diff --git a/tests/queries/0_stateless/02315_grouping_constant_folding.reference b/tests/queries/0_stateless/02315_grouping_constant_folding.reference index c44fee183da..6e591de2661 100644 --- a/tests/queries/0_stateless/02315_grouping_constant_folding.reference +++ b/tests/queries/0_stateless/02315_grouping_constant_folding.reference @@ -1,29 +1,29 @@ -- { echoOn } -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); -1 0 0 0 -1 0 2 0 -1 0 4 0 -1 0 6 0 -1 0 8 0 -1 1 1 0 -1 1 3 0 -1 1 5 0 -1 1 7 0 -1 1 9 0 -5 0 0 1 -5 1 0 1 -10 0 0 3 -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); -1 0 0 0 -1 0 2 0 -1 0 4 0 -1 0 6 0 -1 0 8 0 -1 1 1 0 -1 1 3 0 -1 1 5 0 -1 1 7 0 -1 1 9 0 -5 0 0 1 -5 1 0 1 -10 0 0 3 +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; +1 0 0 3 +1 0 2 3 +1 0 4 3 +1 0 6 3 +1 0 8 3 +1 1 1 3 +1 1 3 3 +1 1 5 3 +1 1 7 3 +1 1 9 3 +5 0 0 2 +5 1 0 2 +10 0 0 0 +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; +1 0 0 3 +1 0 2 3 +1 0 4 3 +1 0 6 3 +1 0 8 3 +1 1 1 3 +1 1 3 3 +1 1 5 3 +1 1 7 3 +1 1 9 3 +5 0 0 2 +5 1 0 2 +10 0 0 0 diff --git a/tests/queries/0_stateless/02315_grouping_constant_folding.sql b/tests/queries/0_stateless/02315_grouping_constant_folding.sql index c4ef087a308..ff259b7be79 100644 --- a/tests/queries/0_stateless/02315_grouping_constant_folding.sql +++ b/tests/queries/0_stateless/02315_grouping_constant_folding.sql @@ -5,9 +5,9 @@ CREATE TABLE test02315(a UInt64, b UInt64) ENGINE=MergeTree() ORDER BY (a, b); INSERT INTO test02315 SELECT number % 2 as a, number as b FROM numbers(10); -- { echoOn } -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; -SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02315 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b) SETTINGS force_grouping_standard_compatibility=0; -- { echoOff } DROP TABLE test02315; From 21ab72365a28448bdf44c05aa3cd33115c7f71ca Mon Sep 17 00:00:00 2001 From: MaceWindu Date: Thu, 1 Sep 2022 11:39:27 +0200 Subject: [PATCH 03/87] Update integrations.md Add 
linq2db to list of third-party libraries --- docs/en/interfaces/third-party/integrations.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/interfaces/third-party/integrations.md b/docs/en/interfaces/third-party/integrations.md index de496546cb4..aede128e9a4 100644 --- a/docs/en/interfaces/third-party/integrations.md +++ b/docs/en/interfaces/third-party/integrations.md @@ -103,6 +103,7 @@ ClickHouse, Inc. does **not** maintain the tools and libraries listed below and - [ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client) - [ClickHouse.Net](https://github.com/ilyabreev/ClickHouse.Net) - [ClickHouse.Net.Migrations](https://github.com/ilyabreev/ClickHouse.Net.Migrations) + - [Linq To DB](https://github.com/linq2db/linq2db) - Elixir - [Ecto](https://github.com/elixir-ecto/ecto) - [clickhouse_ecto](https://github.com/appodeal/clickhouse_ecto) From 310e1484f6301097698a025d0346adce55ad80a4 Mon Sep 17 00:00:00 2001 From: Aleksandr Musorin Date: Thu, 1 Sep 2022 13:28:56 +0200 Subject: [PATCH 04/87] docs - updated optional parameters for table functions --- docs/en/sql-reference/table-functions/file.md | 2 +- docs/en/sql-reference/table-functions/s3.md | 2 +- docs/en/sql-reference/table-functions/s3Cluster.md | 2 +- docs/en/sql-reference/table-functions/url.md | 2 +- docs/ru/sql-reference/table-functions/file.md | 2 +- docs/ru/sql-reference/table-functions/s3.md | 2 +- docs/ru/sql-reference/table-functions/s3Cluster.md | 2 +- docs/ru/sql-reference/table-functions/url.md | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index a110bfbd15c..f40107aaaca 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -13,7 +13,7 @@ Creates a table from a file. This table function is similar to [url](../../sql-r **Syntax** ``` sql -file(path, format, structure) +file(path [,format] [,structure]) ``` **Parameters** diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 2df7d6e46b3..545037665bb 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -11,7 +11,7 @@ Provides table-like interface to select/insert files in [Amazon S3](https://aws. 
**Syntax** ``` sql -s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) +s3(path [,aws_access_key_id, aws_secret_access_key] [,format] [,structure] [,compression]) ``` **Arguments** diff --git a/docs/en/sql-reference/table-functions/s3Cluster.md b/docs/en/sql-reference/table-functions/s3Cluster.md index ec6a73e4cbb..dab76ade780 100644 --- a/docs/en/sql-reference/table-functions/s3Cluster.md +++ b/docs/en/sql-reference/table-functions/s3Cluster.md @@ -11,7 +11,7 @@ Allows processing files from [Amazon S3](https://aws.amazon.com/s3/) in parallel **Syntax** ``` sql -s3Cluster(cluster_name, source, [access_key_id, secret_access_key,] format, structure) +s3Cluster(cluster_name, source, [,access_key_id, secret_access_key] [,format] [,structure]) ``` **Arguments** diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index f1ed7b4dfe4..014dc3ae853 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -13,7 +13,7 @@ sidebar_label: url **Syntax** ``` sql -url(URL, format, structure) +url(URL [,format] [,structure]) ``` **Parameters** diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index 1f262c9403a..df35a1c4ac0 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -13,7 +13,7 @@ sidebar_label: file **Синтаксис** ``` sql -file(path, format, structure) +file(path [,format] [,structure]) ``` **Параметры** diff --git a/docs/ru/sql-reference/table-functions/s3.md b/docs/ru/sql-reference/table-functions/s3.md index ae0419a4b84..14c8204fd1d 100644 --- a/docs/ru/sql-reference/table-functions/s3.md +++ b/docs/ru/sql-reference/table-functions/s3.md @@ -11,7 +11,7 @@ sidebar_label: s3 **Синтаксис** ``` sql -s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compression]) +s3(path [,aws_access_key_id, aws_secret_access_key] [,format] [,structure] [,compression]) ``` **Aргументы** diff --git a/docs/ru/sql-reference/table-functions/s3Cluster.md b/docs/ru/sql-reference/table-functions/s3Cluster.md index e6b317253c0..1c12913fabe 100644 --- a/docs/ru/sql-reference/table-functions/s3Cluster.md +++ b/docs/ru/sql-reference/table-functions/s3Cluster.md @@ -11,7 +11,7 @@ sidebar_label: s3Cluster **Синтаксис** ``` sql -s3Cluster(cluster_name, source, [access_key_id, secret_access_key,] format, structure) +s3Cluster(cluster_name, source, [,access_key_id, secret_access_key] [,format] [,structure]) ``` **Аргументы** diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index d4fb11b0de7..e5d9faeec00 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -13,7 +13,7 @@ sidebar_label: url **Синтаксис** ``` sql -url(URL, format, structure) +url(URL [,format] [,structure]) ``` **Параметры** From 94b74d46a6eb9b18732389322bbd439a357d1052 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 1 Sep 2022 17:05:04 +0200 Subject: [PATCH 05/87] Update tests --- .../02293_grouping_function.reference | 22 +++++++++------ .../0_stateless/02293_grouping_function.sql | 22 +++++++++------ ...02293_grouping_function_group_by.reference | 27 ++++++++++++------- .../02293_grouping_function_group_by.sql | 27 ++++++++++++------- 4 files changed, 64 insertions(+), 34 deletions(-) diff --git a/tests/queries/0_stateless/02293_grouping_function.reference 
b/tests/queries/0_stateless/02293_grouping_function.reference index e71d6812ab5..7d745a0e0fa 100644 --- a/tests/queries/0_stateless/02293_grouping_function.reference +++ b/tests/queries/0_stateless/02293_grouping_function.reference @@ -8,7 +8,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 1 0 1 0 2 @@ -30,7 +31,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 1 0 2 0 2 @@ -52,7 +54,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -73,7 +76,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, grouping(number, number % 2) = 1; +ORDER BY number, grouping(number, number % 2) = 1 +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 @@ -97,7 +101,8 @@ GROUP BY (number, number % 2), () ) -ORDER BY (gr, number); +ORDER BY (gr, number) +SETTINGS force_grouping_standard_compatibility=0; 0 10 0 0 1 2 1 1 2 @@ -129,7 +134,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 2 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; 0 1 2 @@ -150,7 +155,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 1 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; 0 0 SELECT @@ -161,7 +166,8 @@ GROUP BY GROUPING SETS ( (number), (number % 2)) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 diff --git a/tests/queries/0_stateless/02293_grouping_function.sql b/tests/queries/0_stateless/02293_grouping_function.sql index 169fc09c324..cf076c8e51c 100644 --- a/tests/queries/0_stateless/02293_grouping_function.sql +++ b/tests/queries/0_stateless/02293_grouping_function.sql @@ -19,7 +19,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -30,7 +31,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -41,7 +43,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number @@ -51,7 +54,8 @@ GROUP BY (number), (number % 2) ) -ORDER BY number, grouping(number, number % 2) = 1; +ORDER BY number, grouping(number, number % 2) = 1 +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -64,7 +68,8 @@ GROUP BY (number, number % 2), () ) -ORDER BY (gr, number); +ORDER BY (gr, number) +SETTINGS force_grouping_standard_compatibility=0; SELECT number @@ -76,7 +81,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 2 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; SELECT number @@ -88,7 +93,7 @@ GROUP BY ) HAVING grouping(number, number % 2) = 1 ORDER BY number -SETTINGS enable_optimize_predicate_expression = 0; +SETTINGS enable_optimize_predicate_expression = 0, force_grouping_standard_compatibility=0; SELECT number, @@ -98,4 +103,5 @@ GROUP BY GROUPING SETS ( (number), (number % 2)) -ORDER BY number, gr; +ORDER BY number, gr +SETTINGS force_grouping_standard_compatibility=0; diff 
--git a/tests/queries/0_stateless/02293_grouping_function_group_by.reference b/tests/queries/0_stateless/02293_grouping_function_group_by.reference index 7f87aecd4bd..49cdca1411e 100644 --- a/tests/queries/0_stateless/02293_grouping_function_group_by.reference +++ b/tests/queries/0_stateless/02293_grouping_function_group_by.reference @@ -6,7 +6,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; 0 1 1 1 2 1 @@ -25,7 +26,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; 0 1 1 1 1 1 2 1 1 @@ -45,7 +47,8 @@ GROUP BY number % 2 WITH ROLLUP ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 2 0 3 @@ -74,7 +77,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 2 0 3 @@ -105,7 +109,8 @@ GROUP BY number % 2 WITH CUBE ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -136,7 +141,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -168,7 +174,8 @@ GROUP BY CUBE(number, number % 2) HAVING grouping(number) != 0 ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 5 0 6 1 5 @@ -205,7 +212,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 1 0 1 @@ -247,7 +255,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; 0 0 0 2 0 3 diff --git a/tests/queries/0_stateless/02293_grouping_function_group_by.sql b/tests/queries/0_stateless/02293_grouping_function_group_by.sql index 9bf9d43478b..d438a8a5277 100644 --- a/tests/queries/0_stateless/02293_grouping_function_group_by.sql +++ b/tests/queries/0_stateless/02293_grouping_function_group_by.sql @@ -15,7 +15,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -25,7 +26,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY number, number % 2 -ORDER BY number; +ORDER BY number +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -36,7 +38,8 @@ GROUP BY number % 2 WITH ROLLUP ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -45,7 +48,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -56,7 +60,8 @@ GROUP BY number % 2 WITH CUBE ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -65,7 +70,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -75,7 +81,8 @@ GROUP BY CUBE(number, number % 2) HAVING grouping(number) != 0 ORDER BY - number, gr; + number, gr +SETTINGS 
force_grouping_standard_compatibility=0; SELECT number, @@ -94,7 +101,8 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY CUBE(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; SELECT number, @@ -113,4 +121,5 @@ FROM remote('127.0.0.{2,3}', numbers(10)) GROUP BY ROLLUP(number, number % 2) WITH TOTALS ORDER BY - number, gr; + number, gr +SETTINGS force_grouping_standard_compatibility=0; From f71a7e028688097d966cfbc7e29031b38307c7a0 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 1 Sep 2022 17:19:29 +0200 Subject: [PATCH 06/87] Add test for compatibility --- ..._grouping_function_compatibility.reference | 29 +++++++++++++++++++ .../02416_grouping_function_compatibility.sql | 14 +++++++++ 2 files changed, 43 insertions(+) create mode 100644 tests/queries/0_stateless/02416_grouping_function_compatibility.reference create mode 100644 tests/queries/0_stateless/02416_grouping_function_compatibility.sql diff --git a/tests/queries/0_stateless/02416_grouping_function_compatibility.reference b/tests/queries/0_stateless/02416_grouping_function_compatibility.reference new file mode 100644 index 00000000000..c9a3ad2f593 --- /dev/null +++ b/tests/queries/0_stateless/02416_grouping_function_compatibility.reference @@ -0,0 +1,29 @@ +-- { echoOn } +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); +1 0 0 0 +1 0 2 0 +1 0 4 0 +1 0 6 0 +1 0 8 0 +1 1 1 0 +1 1 3 0 +1 1 5 0 +1 1 7 0 +1 1 9 0 +5 0 0 1 +5 1 0 1 +10 0 0 3 +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); +1 0 0 0 +1 0 2 0 +1 0 4 0 +1 0 6 0 +1 0 8 0 +1 1 1 0 +1 1 3 0 +1 1 5 0 +1 1 7 0 +1 1 9 0 +5 0 0 1 +5 1 0 1 +10 0 0 3 diff --git a/tests/queries/0_stateless/02416_grouping_function_compatibility.sql b/tests/queries/0_stateless/02416_grouping_function_compatibility.sql new file mode 100644 index 00000000000..ed21055ade5 --- /dev/null +++ b/tests/queries/0_stateless/02416_grouping_function_compatibility.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS test02416; + +CREATE TABLE test02416(a UInt64, b UInt64) ENGINE=MergeTree() ORDER BY (a, b); + +INSERT INTO test02416 SELECT number % 2 as a, number as b FROM numbers(10); + +-- { echoOn } +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY GROUPING SETS ((a, b), (a), ()) ORDER BY (amount, a, b); + +SELECT count() AS amount, a, b, GROUPING(a, b) FROM test02416 GROUP BY ROLLUP(a, b) ORDER BY (amount, a, b); + +-- { echoOff } +DROP TABLE test02416; + From 3d65e3f2eed31c9891d989e1a3cb437dcd5a431d Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 5 Sep 2022 16:37:55 +0800 Subject: [PATCH 07/87] Add cluster/distributed/remote to file --- src/Storages/Distributed/DirectoryMonitor.cpp | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/Storages/Distributed/DirectoryMonitor.cpp b/src/Storages/Distributed/DirectoryMonitor.cpp index 16981d26146..f84ddeb4f5e 100644 --- a/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/src/Storages/Distributed/DirectoryMonitor.cpp @@ -140,6 +140,11 @@ namespace size_t rows = 0; size_t bytes = 0; + UInt32 shard_num = 0; + std::string cluster_name; + std::string distributed_table; + std::string remote_table; + /// dumpStructure() of the header -- obsolete std::string block_header_string; Block block_header; @@ -195,6 +200,14 @@ namespace in.getFileName(), distributed_header.revision, DBMS_TCP_PROTOCOL_VERSION); 
} + if (header_buf.hasPendingData()) + { + readVarUInt(distributed_header.shard_num, header_buf); + readStringBinary(distributed_header.cluster_name, header_buf); + readStringBinary(distributed_header.distributed_table, header_buf); + readStringBinary(distributed_header.remote_table, header_buf); + } + /// Add handling new data here, for example: /// /// if (header_buf.hasPendingData()) @@ -621,18 +634,23 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa ReadBufferFromFile in(file_path); const auto & distributed_header = readDistributedHeader(in, log); - auto connection = pool->get(timeouts, &distributed_header.insert_settings); + thread_trace_context = std::make_unique(__PRETTY_FUNCTION__, + distributed_header.client_info.client_trace_context, + this->storage.getContext()->getOpenTelemetrySpanLog()); + thread_trace_context->root_span.addAttribute("clickhouse.shard_num", distributed_header.shard_num); + thread_trace_context->root_span.addAttribute("clickhouse.cluster", distributed_header.cluster_name); + thread_trace_context->root_span.addAttribute("clickhouse.distributed", distributed_header.distributed_table); + thread_trace_context->root_span.addAttribute("clickhouse.remote", distributed_header.remote_table); + thread_trace_context->root_span.addAttribute("clickhouse.rows", distributed_header.rows); + thread_trace_context->root_span.addAttribute("clickhouse.bytes", distributed_header.bytes); + auto connection = pool->get(timeouts, &distributed_header.insert_settings); LOG_DEBUG(log, "Sending `{}` to {} ({} rows, {} bytes)", file_path, connection->getDescription(), formatReadableQuantity(distributed_header.rows), formatReadableSizeWithBinarySuffix(distributed_header.bytes)); - thread_trace_context = std::make_unique(__PRETTY_FUNCTION__, - distributed_header.client_info.client_trace_context, - this->storage.getContext()->getOpenTelemetrySpanLog()); - RemoteInserter remote{*connection, timeouts, distributed_header.insert_query, distributed_header.insert_settings, From a17bc51d5b245b40870abb6caaecd16924eeac32 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 5 Sep 2022 16:39:47 +0800 Subject: [PATCH 08/87] Save cluster/distributed/table to log --- src/Storages/Distributed/DistributedSink.cpp | 27 ++++++++++++-------- src/Storages/Distributed/DistributedSink.h | 4 +-- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index ae72fdd84e2..0e379a7bd89 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -338,7 +338,11 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("clickhouse.shard_num", shard_info.shard_num); - span.addAttribute("clickhouse.written_rows", rows); + span.addAttribute("clickhouse.cluster", this->storage.cluster_name); + span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); + span.addAttribute("clickhouse.remote", [this]() { return storage.remote_database + "." 
+ storage.remote_table; }); + span.addAttribute("clickhouse.rows", [rows]() { return std::to_string(rows); }); + span.addAttribute("clickhouse.bytes", [&shard_block]() { return std::to_string(shard_block.bytes()); }); if (!job.is_local_job || !settings.prefer_localhost_replica) { @@ -610,20 +614,15 @@ void DistributedSink::writeSplitAsync(const Block & block) void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) { - OpenTelemetry::SpanHolder span("DistributedSink::writeAsyncImpl()"); - const auto & shard_info = cluster->getShardsInfo()[shard_id]; const auto & settings = context->getSettingsRef(); Block block_to_send = removeSuperfluousColumns(block); - span.addAttribute("clickhouse.shard_num", shard_info.shard_num); - span.addAttribute("clickhouse.written_rows", block.rows()); - if (shard_info.hasInternalReplication()) { if (shard_info.isLocal() && settings.prefer_localhost_replica) /// Prefer insert into current instance directly - writeToLocal(block_to_send, shard_info.getLocalNodeCount()); + writeToLocal(shard_info, block_to_send, shard_info.getLocalNodeCount()); else { const auto & path = shard_info.insertPathForInternalReplication( @@ -631,13 +630,13 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) settings.use_compact_format_in_distributed_parts_names); if (path.empty()) throw Exception("Directory name for async inserts is empty", ErrorCodes::LOGICAL_ERROR); - writeToShard(block_to_send, {path}); + writeToShard(shard_info, block_to_send, {path}); } } else { if (shard_info.isLocal() && settings.prefer_localhost_replica) - writeToLocal(block_to_send, shard_info.getLocalNodeCount()); + writeToLocal(shard_info, block_to_send, shard_info.getLocalNodeCount()); std::vector dir_names; for (const auto & address : cluster->getShardsAddresses()[shard_id]) @@ -645,7 +644,7 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) dir_names.push_back(address.toFullString(settings.use_compact_format_in_distributed_parts_names)); if (!dir_names.empty()) - writeToShard(block_to_send, dir_names); + writeToShard(shard_info, block_to_send, dir_names); } } @@ -666,9 +665,10 @@ void DistributedSink::writeToLocal(const Block & block, size_t repeats) } -void DistributedSink::writeToShard(const Block & block, const std::vector & dir_names) +void DistributedSink::writeToShard(const Cluster::ShardInfo& shard_info, const Block & block, const std::vector & dir_names) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); + span.addAttribute("clickhouse.shard_num", shard_info.shard_num); const auto & settings = context->getSettingsRef(); const auto & distributed_settings = storage.getDistributedSettingsRef(); @@ -759,6 +759,11 @@ void DistributedSink::writeToShard(const Block & block, const std::vectorstorage.cluster_name, header_buf); + writeStringBinary(this->storage.getStorageID().getFullNameNotQuoted(), header_buf); + writeStringBinary(this->storage.remote_database + "." + this->storage.remote_table, header_buf); + /// Add new fields here, for example: /// writeVarUInt(my_new_data, header_buf); /// And note that it is safe, because we have checksum and size for header. diff --git a/src/Storages/Distributed/DistributedSink.h b/src/Storages/Distributed/DistributedSink.h index 668cec22e8b..5d7a5268865 100644 --- a/src/Storages/Distributed/DistributedSink.h +++ b/src/Storages/Distributed/DistributedSink.h @@ -69,9 +69,9 @@ private: Block removeSuperfluousColumns(Block block) const; /// Increments finished_writings_count after each repeat. 
- void writeToLocal(const Block & block, size_t repeats); + void writeToLocal(const Cluster::ShardInfo& shard_info, const Block & block, size_t repeats); - void writeToShard(const Block & block, const std::vector & dir_names); + void writeToShard(const Cluster::ShardInfo& shard_info, const Block & block, const std::vector & dir_names); /// Performs synchronous insertion to remote nodes. If timeout_exceeded flag was set, throws. From 6ab1549d6c32182253586c0c7714a2ecce7a8fd1 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 5 Sep 2022 16:40:48 +0800 Subject: [PATCH 09/87] Update writeToLocal to record related info --- src/Storages/Distributed/DistributedSink.cpp | 28 +++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index 0e379a7bd89..dc33cfa4b60 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -649,19 +649,33 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) } -void DistributedSink::writeToLocal(const Block & block, size_t repeats) +void DistributedSink::writeToLocal(const Cluster::ShardInfo& shard_info, const Block & block, size_t repeats) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("db.statement", this->query_string); + span.addAttribute("clickhouse.shard_num", shard_info.shard_num); + span.addAttribute("clickhouse.cluster", this->storage.cluster_name); + span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); + span.addAttribute("clickhouse.remote", [this]() { return storage.remote_database + "." + storage.remote_table; }); + span.addAttribute("clickhouse.rows", [&block]() { return std::to_string(block.rows()); }); + span.addAttribute("clickhouse.bytes", [&block]() { return std::to_string(block.bytes()); }); - InterpreterInsertQuery interp(query_ast, context, allow_materialized); + try + { + InterpreterInsertQuery interp(query_ast, context, allow_materialized); - auto block_io = interp.execute(); - PushingPipelineExecutor executor(block_io.pipeline); + auto block_io = interp.execute(); + PushingPipelineExecutor executor(block_io.pipeline); - executor.start(); - writeBlockConvert(executor, block, repeats, log); - executor.finish(); + executor.start(); + writeBlockConvert(executor, block, repeats, log); + executor.finish(); + } + catch (...) 
+ { + span.addAttribute(std::current_exception()); + throw; + } } From 8365e7bfac1f210cf664b833507d3771f1981640 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 6 Sep 2022 15:41:21 +0800 Subject: [PATCH 10/87] Remove extra attribute --- src/Storages/Distributed/DistributedSink.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index dc33cfa4b60..57397c6908e 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -171,7 +171,6 @@ void DistributedSink::writeAsync(const Block & block) } else { - if (storage.getShardingKeyExpr() && (cluster->getShardsInfo().size() > 1)) return writeSplitAsync(block); @@ -652,7 +651,6 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) void DistributedSink::writeToLocal(const Cluster::ShardInfo& shard_info, const Block & block, size_t repeats) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); - span.addAttribute("db.statement", this->query_string); span.addAttribute("clickhouse.shard_num", shard_info.shard_num); span.addAttribute("clickhouse.cluster", this->storage.cluster_name); span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); From 49556dad975052c748bb8db6ae414da58bf2094d Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 6 Sep 2022 15:42:45 +0800 Subject: [PATCH 11/87] Add test cases --- ...etry_insert_on_distributed_table.reference | 4 + ...entelemetry_insert_on_distributed_table.sh | 84 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference create mode 100755 tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference new file mode 100644 index 00000000000..fac9fabce8a --- /dev/null +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference @@ -0,0 +1,4 @@ +{'clickhouse.shard_num':'1','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} +{'clickhouse.shard_num':'2','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} +{'clickhouse.shard_num':'1','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} +{'clickhouse.shard_num':'2','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh new file mode 100755 index 00000000000..6f766e9f3bb --- /dev/null +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# Tags: distributed + +set -ue + +unset CLICKHOUSE_LOG_COMMENT + +CURDIR=$(cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +${CLICKHOUSE_CLIENT} -nq " +SET distributed_ddl_output_mode = 'none'; + +SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; +TRUNCATE TABLE IF EXISTS system.opentelemetry_span_log ON CLUSTER test_cluster_two_shards; + +DROP TABLE IF EXISTS default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; +DROP TABLE IF EXISTS default.local_opentelemetry ON CLUSTER test_cluster_two_shards; + +CREATE TABLE default.dist_opentelemetry ON CLUSTER test_cluster_two_shards (key UInt64) Engine=Distributed('test_cluster_two_shards', default, local_opentelemetry, key % 2); +CREATE TABLE default.local_opentelemetry ON CLUSTER test_cluster_two_shards (key UInt64) Engine=MergeTree ORDER BY key; +" + +# +# INSERT ASYNC test +# Do test with opentelemetry enabled +# +${CLICKHOUSE_CLIENT} -nq " +-- Make sure it's async +SET insert_distributed_sync=0; +INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1 VALUES(1),(2); +" + +# Wait complete of ASYNC INSERT on distributed table +wait + +# Check log +${CLICKHOUSE_CLIENT} -nq " +-- Flush opentelemetry span log on all nodes +SET distributed_ddl_output_mode = 'none'; +SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; + +-- Above INSERT will insert data to two shards respectively, so there will be two spans generated +SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%writeToLocal%'; +SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%processFile%'; +" + +# +# INSERT SYNC test +# Do test with opentelemetry enabled and in SYNC mode +# +${CLICKHOUSE_CLIENT} -nq " + +-- Clear log +SET distributed_ddl_output_mode = 'none'; +TRUNCATE TABLE IF EXISTS system.opentelemetry_span_log ON CLUSTER test_cluster_two_shards; + +-- Make sure it's SYNC +SET insert_distributed_sync=1; + +-- INSERT test +INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1 VALUES(1),(2); +" + +# Check log +${CLICKHOUSE_CLIENT} -nq " +-- Flush opentelemetry span log on all nodes +SET distributed_ddl_output_mode = 'none'; +SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; + +-- Above INSERT will insert data to two shards in the same flow, so there should be two spans generated with the same operation name +SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%runWritingJob%'; +" + +# +# Cleanup +# +${CLICKHOUSE_CLIENT} -nq " +SET distributed_ddl_output_mode = 'none'; +DROP TABLE default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; +DROP TABLE default.local_opentelemetry ON CLUSTER test_cluster_two_shards; +" From 206709603502b8f0f8f99996d332458976f15c7f Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 6 Sep 2022 16:01:31 +0800 Subject: [PATCH 12/87] Optimize span log for SYNC insert --- src/Storages/Distributed/DistributedSink.cpp | 22 +++++++++++--------- src/Storages/Distributed/DistributedSink.h | 4 ++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index 57397c6908e..8099a7f2002 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -290,6 +290,8 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si auto thread_group = 
CurrentThread::getGroup(); return [this, thread_group, &job, ¤t_block, num_shards]() { + OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); + if (thread_group) CurrentThread::attachToIfDetached(thread_group); setThreadName("DistrOutStrProc"); @@ -330,18 +332,18 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si const Block & shard_block = (num_shards > 1) ? job.current_shard_block : current_block; const Settings & settings = context->getSettingsRef(); - /// Do not initiate INSERT for empty block. size_t rows = shard_block.rows(); - if (rows == 0) - return; - OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("clickhouse.shard_num", shard_info.shard_num); span.addAttribute("clickhouse.cluster", this->storage.cluster_name); span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); span.addAttribute("clickhouse.remote", [this]() { return storage.remote_database + "." + storage.remote_table; }); - span.addAttribute("clickhouse.rows", [rows]() { return std::to_string(rows); }); - span.addAttribute("clickhouse.bytes", [&shard_block]() { return std::to_string(shard_block.bytes()); }); + span.addAttribute("clickhouse.rows", rows); + span.addAttribute("clickhouse.bytes", [&shard_block]() { return toString(shard_block.bytes()); }); + + /// Do not initiate INSERT for empty block. + if (rows == 0) + return; if (!job.is_local_job || !settings.prefer_localhost_replica) { @@ -648,15 +650,15 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) } -void DistributedSink::writeToLocal(const Cluster::ShardInfo& shard_info, const Block & block, size_t repeats) +void DistributedSink::writeToLocal(const Cluster::ShardInfo & shard_info, const Block & block, size_t repeats) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("clickhouse.shard_num", shard_info.shard_num); span.addAttribute("clickhouse.cluster", this->storage.cluster_name); span.addAttribute("clickhouse.distributed", this->storage.getStorageID().getFullNameNotQuoted()); span.addAttribute("clickhouse.remote", [this]() { return storage.remote_database + "." + storage.remote_table; }); - span.addAttribute("clickhouse.rows", [&block]() { return std::to_string(block.rows()); }); - span.addAttribute("clickhouse.bytes", [&block]() { return std::to_string(block.bytes()); }); + span.addAttribute("clickhouse.rows", [&block]() { return toString(block.rows()); }); + span.addAttribute("clickhouse.bytes", [&block]() { return toString(block.bytes()); }); try { @@ -677,7 +679,7 @@ void DistributedSink::writeToLocal(const Cluster::ShardInfo& shard_info, const B } -void DistributedSink::writeToShard(const Cluster::ShardInfo& shard_info, const Block & block, const std::vector & dir_names) +void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const Block & block, const std::vector & dir_names) { OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("clickhouse.shard_num", shard_info.shard_num); diff --git a/src/Storages/Distributed/DistributedSink.h b/src/Storages/Distributed/DistributedSink.h index 5d7a5268865..af0c64cbd78 100644 --- a/src/Storages/Distributed/DistributedSink.h +++ b/src/Storages/Distributed/DistributedSink.h @@ -69,9 +69,9 @@ private: Block removeSuperfluousColumns(Block block) const; /// Increments finished_writings_count after each repeat. 
- void writeToLocal(const Cluster::ShardInfo& shard_info, const Block & block, size_t repeats); + void writeToLocal(const Cluster::ShardInfo & shard_info, const Block & block, size_t repeats); - void writeToShard(const Cluster::ShardInfo& shard_info, const Block & block, const std::vector & dir_names); + void writeToShard(const Cluster::ShardInfo & shard_info, const Block & block, const std::vector & dir_names); /// Performs synchronous insertion to remote nodes. If timeout_exceeded flag was set, throws. From 18db90dcfc968d38b7ad293691e4981c36690a5e Mon Sep 17 00:00:00 2001 From: zhenjial Date: Tue, 23 Aug 2022 12:17:36 +0800 Subject: [PATCH 13/87] Record errors while reading text formats (CSV, TSV). --- src/Client/ClientBase.cpp | 114 +++++++++--- src/Client/ClientBase.h | 1 + src/Core/Settings.h | 1 + src/Processors/Formats/IInputFormat.h | 26 +++ src/Processors/Formats/IRowInputFormat.cpp | 26 ++- src/Processors/Formats/IRowInputFormat.h | 2 + .../Impl/ParallelParsingInputFormat.cpp | 28 ++- .../RowInputFormatWithDiagnosticInfo.cpp | 57 ++++++ .../RowInputFormatWithDiagnosticInfo.h | 1 + src/Storages/StorageFile.cpp | 167 ++++++++++++++---- 10 files changed, 360 insertions(+), 63 deletions(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 465d4358e91..34be04af43d 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -1129,8 +1130,65 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars { /// If structure was received (thus, server has not thrown an exception), /// send our data with that structure. + bool change = false; + if (global_context->getInsertionTable().empty() && parsed_insert_query.table) + { + String table = parsed_insert_query.table->as().shortName(); + if (!table.empty()) + { + change = true; + String database = parsed_insert_query.database ? parsed_insert_query.database->as().shortName() : ""; + global_context->setInsertionTable(StorageID(database, table)); + } + } + sendData(sample, columns_description, parsed_query); receiveEndOfQuery(); + + if (change) + global_context->setInsertionTable(StorageID::createEmpty()); + } +} + + +void ClientBase::errorRowsSink(const QueryPipeline & pipeline) +{ + const auto & processors = pipeline.getProcessors(); + if (!processors.empty()) + { + IInputFormat * input_format = dynamic_cast(processors[0].get()); + if (!input_format || input_format->isEmptyMultiErrorRows()) + return; + + String file_name = global_context->getSettingsRef().input_format_record_errors_table_or_file_name; + String database_name; + String table_name; + try + { + table_name = global_context->getInsertionTable().getTableName(); + database_name = global_context->getInsertionTable().getDatabaseName(); + } + catch (...) + { + /// Ignore + } + + const auto & multi_error_rows = input_format->getMultiErrorRows(); + + std::ofstream out(file_name, std::ios::app); + if (out.is_open()) + { + for (const auto & error_rows : multi_error_rows) + for (const auto & error_row : error_rows) + out << "Time: " << error_row.time << "\nDatabase: " << database_name << "\nTable: " << table_name + << "\nOffset: " << error_row.offset << "\nReason: \n" + << error_row.reason << "\nRaw data: " << error_row.raw_data << "\n------\n"; + out.close(); + } + else + { + std::cout << "Failed to open file that records error rows." 
<< std::endl; + } } } @@ -1294,39 +1352,49 @@ try PullingAsyncPipelineExecutor executor(pipeline); Block block; - while (executor.pull(block)) + try { - if (!cancelled && QueryInterruptHandler::cancelled()) + while (executor.pull(block)) { - cancelQuery(); - executor.cancel(); - return; - } + if (!cancelled && QueryInterruptHandler::cancelled()) + { + cancelQuery(); + executor.cancel(); + return; + } - /// Check if server send Log packet - receiveLogsAndProfileEvents(parsed_query); + /// Check if server send Log packet + receiveLogsAndProfileEvents(parsed_query); - /// Check if server send Exception packet - auto packet_type = connection->checkPacket(0); - if (packet_type && *packet_type == Protocol::Server::Exception) - { - /** + /// Check if server send Exception packet + auto packet_type = connection->checkPacket(0); + if (packet_type && *packet_type == Protocol::Server::Exception) + { + /** * We're exiting with error, so it makes sense to kill the * input stream without waiting for it to complete. */ - executor.cancel(); - return; + executor.cancel(); + return; + } + + if (block) + { + connection->sendData(block, /* name */"", /* scalar */false); + processed_rows += block.rows(); + } } - if (block) - { - connection->sendData(block, /* name */"", /* scalar */false); - processed_rows += block.rows(); - } + errorRowsSink(pipeline); + + if (!have_more_data) + connection->sendData({}, "", false); + } + catch (...) + { + errorRowsSink(pipeline); + throw; } - - if (!have_more_data) - connection->sendData({}, "", false); } catch (...) { diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 6b19c1b8e02..fda38acca99 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -132,6 +132,7 @@ private: void onEndOfStream(); void onProfileEvents(Block & block); + void errorRowsSink(const QueryPipeline & pipeline); void sendData(Block & sample, const ColumnsDescription & columns_description, ASTPtr parsed_query); void sendDataFrom(ReadBuffer & buf, Block & sample, const ColumnsDescription & columns_description, ASTPtr parsed_query, bool have_more_data = false); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1a2b9e42a25..9894f7cfca4 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -773,6 +773,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ M(Float, input_format_allow_errors_ratio, 0, "Maximum relative amount of errors while reading text formats (like CSV, TSV). 
In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ + M(String, input_format_record_errors_table_or_file_name, "input_format_error_rows", "Name of the table or file used to record error rows while reading text formats (CSV, TSV).", 0) \ \ M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index e2bd208764e..0587c7daa85 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -11,6 +11,16 @@ namespace DB using ColumnMappingPtr = std::shared_ptr; +struct InputFormatErrorRow +{ + String time; + size_t offset; + String reason; + String raw_data; +}; + +using InputFormatErrorRows = std::vector; + /** Input format is a source, that reads data from ReadBuffer. */ class IInputFormat : public ISource @@ -55,6 +65,19 @@ public: void addBuffer(std::unique_ptr buffer) { owned_buffers.emplace_back(std::move(buffer)); } + void addErrorRow(InputFormatErrorRow && error_row) { error_rows.emplace_back(error_row); } + InputFormatErrorRows & getErrorRows() { return error_rows; } + + void addErrorRows(InputFormatErrorRows & source_error_rows) + { + multi_error_rows.emplace_back(InputFormatErrorRows()); + multi_error_rows.back().swap(source_error_rows); + } + const std::list & getMultiErrorRows() { return multi_error_rows; } + + bool isEmptyErrorRows() { return error_rows.empty(); } + bool isEmptyMultiErrorRows() { return multi_error_rows.empty(); } + protected: ColumnMappingPtr column_mapping{}; @@ -62,6 +85,9 @@ private: /// Number of currently parsed chunk (if parallel parsing is enabled) size_t current_unit_number = 0; + InputFormatErrorRows error_rows; + std::list multi_error_rows; + std::vector> owned_buffers; }; diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 3df22002b82..57e9a8c3c30 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -1,7 +1,9 @@ #include #include -#include // toString #include +#include +#include +#include #include @@ -104,6 +106,28 @@ Chunk IRowInputFormat::generate() } catch (Exception & e) { + /// Record error info for this row + String diagnostic; + String raw_data; + try + { + std::tie(diagnostic, raw_data) = getDiagnosticAndRawData(); + } + catch (const Exception & exception) + { + diagnostic = "Cannot get diagnostic: " + exception.message(); + raw_data = "Cannot get raw data: " + exception.message(); + } + catch (...) + { + /// Error while trying to obtain verbose diagnostic. Ok to ignore. + } + trimRight(diagnostic, '\n'); + + auto now_time = time(nullptr); + + addErrorRow(InputFormatErrorRow{to_string(now_time), total_rows, diagnostic, raw_data}); + /// Logic for possible skipping of errors. if (!isParseError(e.code())) diff --git a/src/Processors/Formats/IRowInputFormat.h b/src/Processors/Formats/IRowInputFormat.h index 87caadd93da..7ef766c1d85 100644 --- a/src/Processors/Formats/IRowInputFormat.h +++ b/src/Processors/Formats/IRowInputFormat.h @@ -65,6 +65,8 @@ protected: /// and collect as much as possible diagnostic information about error. /// If not implemented, returns empty string. 
virtual std::string getDiagnosticInfo() { return {}; } + /// Get diagnostic info and raw data for a row + virtual std::tuple getDiagnosticAndRawData() { return std::make_tuple("", ""); } const BlockMissingValues & getMissingValues() const override { return block_missing_values; } diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp index 318bcaed466..79023d557b8 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp @@ -88,12 +88,30 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr threa // We don't know how many blocks will be. So we have to read them all // until an empty block occurred. Chunk chunk; - while (!parsing_finished && (chunk = parser.getChunk())) + try { - /// Variable chunk is moved, but it is not really used in the next iteration. - /// NOLINTNEXTLINE(bugprone-use-after-move, hicpp-invalid-access-moved) - unit.chunk_ext.chunk.emplace_back(std::move(chunk)); - unit.chunk_ext.block_missing_values.emplace_back(parser.getMissingValues()); + while (!parsing_finished && (chunk = parser.getChunk())) + { + /// Variable chunk is moved, but it is not really used in the next iteration. + /// NOLINTNEXTLINE(bugprone-use-after-move, hicpp-invalid-access-moved) + unit.chunk_ext.chunk.emplace_back(std::move(chunk)); + unit.chunk_ext.block_missing_values.emplace_back(parser.getMissingValues()); + } + + if (!input_format->isEmptyErrorRows()) + { + std::lock_guard lock(mutex); + addErrorRows(input_format->getErrorRows()); + } + } + catch (...) + { + if (!input_format->isEmptyErrorRows()) + { + std::lock_guard lock(mutex); + addErrorRows(input_format->getErrorRows()); + } + throw; } /// Extract column_mapping from first parser to propagate it to others diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index f4568830720..9e5d15680d9 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -91,6 +91,63 @@ String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() return out.str(); } +std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawData() +{ + WriteBufferFromOwnString out_diag; + WriteBufferFromOwnString out_data; + + if (in->eof()) + { + out_diag << "Buffer has gone, cannot extract information about what has been parsed."; + out_data << "Buffer has gone, cannot extract information about what has been parsed."; + return std::make_tuple(out_diag.str(), out_data.str()); + } + + const auto & header = getPort().getHeader(); + MutableColumns columns = header.cloneEmptyColumns(); + + /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer. 
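The parallel-parsing hunk above has every parser thread collect its own error rows and hand them over to the shared input format under a mutex, on the success path and on the exception path alike. A minimal standalone sketch of that hand-off (simplified names, plain strings instead of the real error-row struct):

    #include <mutex>
    #include <string>
    #include <thread>
    #include <vector>

    struct ErrorSink
    {
        std::mutex mutex;
        std::vector<std::vector<std::string>> per_thread_errors;

        /// Take ownership of one thread's local error list under the lock.
        void add(std::vector<std::string> & local)
        {
            std::lock_guard<std::mutex> lock(mutex);
            per_thread_errors.emplace_back();
            per_thread_errors.back().swap(local);  /// cheap hand-off, `local` ends up empty
        }
    };

    int main()
    {
        ErrorSink sink;
        std::vector<std::thread> workers;
        for (int t = 0; t < 4; ++t)
        {
            workers.emplace_back([&sink, t]
            {
                std::vector<std::string> local = {"bad row from worker " + std::to_string(t)};
                sink.add(local);
            });
        }
        for (auto & w : workers)
            w.join();
        return sink.per_thread_errors.size() == 4 ? 0 : 1;
    }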
+ size_t bytes_read_at_start_of_buffer = in->count() - in->offset(); + if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) + { + out_diag << "Could not collect diagnostic because two last rows are not in buffer (rare case)"; + out_data << "Could not collect raw data because two last rows are not in buffer (rare case)"; + return std::make_tuple(out_diag.str(), out_data.str()); + } + + max_length_of_column_name = 0; + for (size_t i = 0; i < header.columns(); ++i) + if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) + max_length_of_column_name = header.safeGetByPosition(i).name.size(); + + max_length_of_data_type_name = 0; + for (size_t i = 0; i < header.columns(); ++i) + if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) + max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); + + /// Roll back the cursor to the beginning of the current row and parse all over again. + + if (in->buffer().size() < offset_of_current_row) + { + out_diag << "Could not collect diagnostic because parsing of data has not started."; + out_data << "Could not collect raw data because parsing of data has not started."; + return std::make_tuple(out_diag.str(), out_data.str()); + } + + in->position() = in->buffer().begin() + offset_of_current_row; + + char * data = in->position(); + while (*data != '\n' && *data != '\r' && *data != '\0' && data < in->buffer().end()) + { + out_data << *data; + ++data; + } + + parseRowAndPrintDiagnosticInfo(columns, out_diag); + + return std::make_tuple(out_diag.str(), out_data.str()); +} + bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index 5bad24cd482..14f11e91ff0 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -15,6 +15,7 @@ public: RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_); String getDiagnosticInfo() override; + std::tuple getDiagnosticAndRawData() override; void resetParser() override; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 0788a9f73d8..a3ab0bc7c17 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -19,6 +20,7 @@ #include #include #include +#include #include #include @@ -45,6 +47,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -509,6 +512,92 @@ public: return storage->getName(); } + void errorRowsSink() + { + const auto & processors = pipeline->getProcessors(); + if (!processors.empty()) + { + IInputFormat * input_format = dynamic_cast(processors[0].get()); + if (!input_format || (input_format->isEmptyErrorRows() && input_format->isEmptyMultiErrorRows())) + return; + + String table_or_file_name = context->getSettingsRef().input_format_record_errors_table_or_file_name; + String database_name; + String table_name; + try + { + table_name = context->getInsertionTable().getTableName(); + database_name = context->getInsertionTable().getDatabaseName(); + } + catch (...) 
+ { + /// Ignore + } + + if (context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER)) + { + InputFormatErrorRows & error_rows = input_format->getErrorRows(); + Poco::Logger * log = &Poco::Logger::get("StorageFileSource"); + + try + { + auto copy_context = Context::createCopy(context); + + String query = "create table if not exists " + table_or_file_name + + "(time DateTime, database String, table String, offset UInt32, reason String, raw_data String) engine MergeTree " + "order by time " + "comment 'Record error rows while reading text formats (like CSV, TSV).'"; + executeQuery(query, copy_context, true); + + copy_context->setInternalQuery(true); + copy_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + copy_context->setCurrentQueryId(""); + + String insert_query = "insert into " + table_or_file_name + " values "; + for (auto & error_row : error_rows) + { + for (char & ch : error_row.reason) + { + if (ch == '\'') + ch = '_'; + } + insert_query = insert_query + "('" + error_row.time + "','" + database_name + "','" + table_name + "'," + + toString(error_row.offset) + ",'" + error_row.reason + "','" + error_row.raw_data + "'),"; + } + + ReadBufferFromString insert_read_buf(insert_query); + String dummy_string; + WriteBufferFromString insert_write_buf(dummy_string); + + executeQuery(insert_read_buf, insert_write_buf, false, copy_context, nullptr, {}); + } + catch (Exception & e) + { + LOG_INFO(log, "Error occurred while executing a query that handles error rows: {}", e.message()); + } + catch (...) + { + LOG_INFO(log, "Unknown error occurred while executing a query that handles error rows."); + } + } + else + { + const auto & multi_error_rows = input_format->getMultiErrorRows(); + + std::ofstream out(table_or_file_name, std::ios::app); + if (out.is_open()) + { + for (const auto & error_rows : multi_error_rows) + for (const auto & error_row : error_rows) + out << "Time: " << error_row.time << "\nDatabase: " << database_name << "\nTable: " << table_name + << "\nOffset: " << error_row.offset << "\nReason: \n" + << error_row.reason << "\nRaw data: " << error_row.raw_data << "\n------\n"; + out.close(); + } + } + } + } + Chunk generate() override { while (!finished_generate) @@ -556,46 +645,56 @@ public: } Chunk chunk; - if (reader->pull(chunk)) + try { - UInt64 num_rows = chunk.getNumRows(); - - /// Enrich with virtual columns. - if (files_info->need_path_column) + if (reader->pull(chunk)) { - auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, current_path); - chunk.addColumn(column->convertToFullColumnIfConst()); - } + UInt64 num_rows = chunk.getNumRows(); - if (files_info->need_file_column) - { - size_t last_slash_pos = current_path.find_last_of('/'); - auto file_name = current_path.substr(last_slash_pos + 1); - - auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, std::move(file_name)); - chunk.addColumn(column->convertToFullColumnIfConst()); - } - - if (num_rows) - { - auto bytes_per_row = std::ceil(static_cast(chunk.bytes()) / num_rows); - size_t total_rows_approx = std::ceil(static_cast(files_info->total_bytes_to_read) / bytes_per_row); - total_rows_approx_accumulated += total_rows_approx; - ++total_rows_count_times; - total_rows_approx = total_rows_approx_accumulated / total_rows_count_times; - - /// We need to add diff, because total_rows_approx is incremental value. 
- /// It would be more correct to send total_rows_approx as is (not a diff), - /// but incrementation of total_rows_to_read does not allow that. - /// A new field can be introduces for that to be sent to client, but it does not worth it. - if (total_rows_approx > total_rows_approx_prev) + /// Enrich with virtual columns. + if (files_info->need_path_column) { - size_t diff = total_rows_approx - total_rows_approx_prev; - addTotalRowsApprox(diff); - total_rows_approx_prev = total_rows_approx; + auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, current_path); + chunk.addColumn(column->convertToFullColumnIfConst()); } + + if (files_info->need_file_column) + { + size_t last_slash_pos = current_path.find_last_of('/'); + auto file_name = current_path.substr(last_slash_pos + 1); + + auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, std::move(file_name)); + chunk.addColumn(column->convertToFullColumnIfConst()); + } + + if (num_rows) + { + auto bytes_per_row = std::ceil(static_cast(chunk.bytes()) / num_rows); + size_t total_rows_approx = std::ceil(static_cast(files_info->total_bytes_to_read) / bytes_per_row); + total_rows_approx_accumulated += total_rows_approx; + ++total_rows_count_times; + total_rows_approx = total_rows_approx_accumulated / total_rows_count_times; + + /// We need to add diff, because total_rows_approx is incremental value. + /// It would be more correct to send total_rows_approx as is (not a diff), + /// but incrementation of total_rows_to_read does not allow that. + /// A new field can be introduces for that to be sent to client, but it does not worth it. + if (total_rows_approx > total_rows_approx_prev) + { + size_t diff = total_rows_approx - total_rows_approx_prev; + addTotalRowsApprox(diff); + total_rows_approx_prev = total_rows_approx; + } + } + return chunk; } - return chunk; + + errorRowsSink(); + } + catch (...) + { + errorRowsSink(); + throw; } /// Read only once for file descriptor. 
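Stepping back from this patch as a whole: the approximate-progress arithmetic that the StorageFile hunk above re-indents is easy to lose in the diff noise. What it computes is roughly the following (a simplified sketch, not the actual StorageFileSource code; the total bytes to read and the per-chunk sizes are the only inputs):

    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    /// Every chunk refreshes an averaged estimate of the total row count, and only
    /// the positive delta against what was already reported is returned, because
    /// the progress counter on the receiving side is incremental.
    struct RowsEstimator
    {
        size_t total_bytes_to_read = 0;
        size_t accumulated = 0;
        size_t samples = 0;
        size_t reported = 0;

        size_t onChunk(size_t chunk_bytes, size_t chunk_rows)
        {
            if (chunk_rows == 0)
                return 0;
            double bytes_per_row = std::ceil(static_cast<double>(chunk_bytes) / chunk_rows);
            size_t estimate = static_cast<size_t>(std::ceil(total_bytes_to_read / bytes_per_row));
            accumulated += estimate;
            ++samples;
            size_t averaged = accumulated / samples;
            if (averaged <= reported)
                return 0;
            size_t diff = averaged - reported;
            reported = averaged;
            return diff;
        }
    };

    int main()
    {
        RowsEstimator est;
        est.total_bytes_to_read = 1000;
        std::printf("%zu\n", est.onChunk(100, 10));  /// first chunk: ~100 rows estimated
        std::printf("%zu\n", est.onChunk(100, 20));  /// estimate refines, only the delta is returned
        return 0;
    }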
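A smaller observation on the same patch: the client-side insert loop in ClientBase.cpp, ParallelParsingInputFormat::parserThreadFunction and the StorageFile source's generate() all wrap their main loop in the same pattern (try { ...; sink(); } catch (...) { sink(); throw; }) so that collected error rows are flushed on both the normal and the exception path. The same intent can be written once as a scope guard; a sketch under the assumption that the sink callback itself never throws (the later 'new implementation' patch, PATCH 15/87, removes these sinks again, so this is only a note on the pattern):

    #include <cstdio>
    #include <stdexcept>
    #include <utility>

    /// Runs a callback when the scope ends, whether it ends normally or by exception.
    template <typename F>
    class ScopeGuard
    {
    public:
        explicit ScopeGuard(F f) : func(std::move(f)) {}
        ~ScopeGuard() { func(); }  /// also runs during stack unwinding
        ScopeGuard(const ScopeGuard &) = delete;
        ScopeGuard & operator=(const ScopeGuard &) = delete;
    private:
        F func;
    };

    void process(bool fail)
    {
        ScopeGuard guard([] { std::puts("flushing collected error rows"); });
        if (fail)
            throw std::runtime_error("parse error");
        std::puts("processed normally");
    }

    int main()
    {
        process(false);
        try { process(true); } catch (const std::exception &) {}
        return 0;
    }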
From 3c43b8ac432c38a5dce49b5e1b629db5b7ae7abd Mon Sep 17 00:00:00 2001 From: zhenjial Date: Sat, 27 Aug 2022 00:07:55 +0800 Subject: [PATCH 14/87] just use file to store errors --- src/Client/ClientBase.cpp | 32 ++++++++++----- src/Core/Settings.h | 2 +- src/Storages/StorageFile.cpp | 78 ++++++++++++++++-------------------- 3 files changed, 57 insertions(+), 55 deletions(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 34be04af43d..93af751e2b9 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include @@ -1160,7 +1159,13 @@ void ClientBase::errorRowsSink(const QueryPipeline & pipeline) if (!input_format || input_format->isEmptyMultiErrorRows()) return; - String file_name = global_context->getSettingsRef().input_format_record_errors_table_or_file_name; + String errors_file_path = global_context->getSettingsRef().input_format_record_errors_file_name; + if (global_context->getSettingsRef().isChanged("input_format_record_errors_file_name")) + { + while (fs::exists(errors_file_path)) + errors_file_path += "_new"; + } + String database_name; String table_name; try @@ -1174,20 +1179,25 @@ void ClientBase::errorRowsSink(const QueryPipeline & pipeline) } const auto & multi_error_rows = input_format->getMultiErrorRows(); - - std::ofstream out(file_name, std::ios::app); - if (out.is_open()) + try { + WriteBufferFromFile out(errors_file_path); for (const auto & error_rows : multi_error_rows) + { for (const auto & error_row : error_rows) - out << "Time: " << error_row.time << "\nDatabase: " << database_name << "\nTable: " << table_name - << "\nOffset: " << error_row.offset << "\nReason: \n" - << error_row.reason << "\nRaw data: " << error_row.raw_data << "\n------\n"; - out.close(); + { + String row_in_file = "Time: " + error_row.time + "\nDatabase: " + database_name + "\nTable: " + table_name + + "\nOffset: " + toString(error_row.offset) + "\nReason: \n" + error_row.reason + + "\nRaw data: " + error_row.raw_data + "\n------\n"; + out.write(row_in_file.data(), row_in_file.size()); + } + } + out.sync(); } - else + catch (...) { - std::cout << "Failed to open file that records error rows." << std::endl; + std::cout << "Caught Exception " + getCurrentExceptionMessage(false) + " while writing the Errors file " + errors_file_path + << std::endl; } } } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9894f7cfca4..5e011a43409 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -773,7 +773,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ M(Float, input_format_allow_errors_ratio, 0, "Maximum relative amount of errors while reading text formats (like CSV, TSV). 
In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ - M(String, input_format_record_errors_table_or_file_name, "input_format_error_rows", "Name of the table or file used to record error rows while reading text formats (CSV, TSV).", 0) \ + M(String, input_format_record_errors_file_name, "_input_format_error_rows_", "Name of the file used to record errors while reading text formats (CSV, TSV).", 0) \ \ M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index a3ab0bc7c17..e77b5349f19 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include @@ -20,7 +19,6 @@ #include #include #include -#include #include #include @@ -47,7 +45,6 @@ #include #include #include -#include namespace fs = std::filesystem; @@ -521,7 +518,17 @@ public: if (!input_format || (input_format->isEmptyErrorRows() && input_format->isEmptyMultiErrorRows())) return; - String table_or_file_name = context->getSettingsRef().input_format_record_errors_table_or_file_name; + String errors_file_name = context->getSettingsRef().input_format_record_errors_file_name; + String errors_file_path = context->getUserFilesPath(); + if (!errors_file_path.empty()) + trimLeft(errors_file_name, '/'); + errors_file_path += errors_file_name; + if (context->getSettingsRef().isChanged("input_format_record_errors_file_name")) + { + while (fs::exists(errors_file_path)) + errors_file_path += "_new"; + } + String database_name; String table_name; try @@ -536,63 +543,48 @@ public: if (context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER)) { - InputFormatErrorRows & error_rows = input_format->getErrorRows(); Poco::Logger * log = &Poco::Logger::get("StorageFileSource"); + InputFormatErrorRows & error_rows = input_format->getErrorRows(); try { - auto copy_context = Context::createCopy(context); - - String query = "create table if not exists " + table_or_file_name - + "(time DateTime, database String, table String, offset UInt32, reason String, raw_data String) engine MergeTree " - "order by time " - "comment 'Record error rows while reading text formats (like CSV, TSV).'"; - executeQuery(query, copy_context, true); - - copy_context->setInternalQuery(true); - copy_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - copy_context->setCurrentQueryId(""); - - String insert_query = "insert into " + table_or_file_name + " values "; + WriteBufferFromFile out(errors_file_path); for (auto & error_row : error_rows) { - for (char & ch : error_row.reason) - { - if (ch == '\'') - ch = '_'; - } - insert_query = insert_query + "('" + error_row.time + "','" + database_name + "','" + table_name + "'," - + toString(error_row.offset) + ",'" + error_row.reason + "','" + error_row.raw_data + "'),"; + String row_in_file = "Time: " + error_row.time + "\nDatabase: " + database_name + "\nTable: " + table_name + + "\nOffset: " + toString(error_row.offset) + "\nReason: \n" + error_row.reason + + "\nRaw data: " + error_row.raw_data + "\n------\n"; + out.write(row_in_file.data(), row_in_file.size()); } - - ReadBufferFromString insert_read_buf(insert_query); - String dummy_string; - 
WriteBufferFromString insert_write_buf(dummy_string); - - executeQuery(insert_read_buf, insert_write_buf, false, copy_context, nullptr, {}); - } - catch (Exception & e) - { - LOG_INFO(log, "Error occurred while executing a query that handles error rows: {}", e.message()); + out.sync(); } catch (...) { - LOG_INFO(log, "Unknown error occurred while executing a query that handles error rows."); + LOG_ERROR( + log, "Caught Exception {} while writing the Errors file {}", getCurrentExceptionMessage(false), errors_file_path); } } else { const auto & multi_error_rows = input_format->getMultiErrorRows(); - - std::ofstream out(table_or_file_name, std::ios::app); - if (out.is_open()) + try { + WriteBufferFromFile out(errors_file_path); for (const auto & error_rows : multi_error_rows) + { for (const auto & error_row : error_rows) - out << "Time: " << error_row.time << "\nDatabase: " << database_name << "\nTable: " << table_name - << "\nOffset: " << error_row.offset << "\nReason: \n" - << error_row.reason << "\nRaw data: " << error_row.raw_data << "\n------\n"; - out.close(); + { + String row_in_file = "Time: " + error_row.time + "\nDatabase: " + database_name + "\nTable: " + table_name + + "\nOffset: " + toString(error_row.offset) + "\nReason: \n" + error_row.reason + + "\nRaw data: " + error_row.raw_data + "\n------\n"; + out.write(row_in_file.data(), row_in_file.size()); + } + } + out.sync(); + } + catch (...) + { + /// Ignore } } } From 0f788d98f5b1d28e5551e1e11a2e1aff662eb45d Mon Sep 17 00:00:00 2001 From: zhenjial Date: Mon, 5 Sep 2022 23:42:49 +0800 Subject: [PATCH 15/87] new implementation --- src/Client/ClientBase.cpp | 109 +++--------- src/Client/ClientBase.h | 1 - src/Core/Settings.h | 3 +- src/Core/SettingsEnums.cpp | 3 + src/Core/SettingsEnums.h | 2 + src/Formats/FormatFactory.cpp | 28 ++- src/Formats/FormatSettings.h | 5 + src/Processors/Formats/IInputFormat.cpp | 83 +++++++++ src/Processors/Formats/IInputFormat.h | 80 ++++++--- src/Processors/Formats/IRowInputFormat.cpp | 4 +- .../Impl/ParallelParsingInputFormat.cpp | 29 +--- .../RowInputFormatWithDiagnosticInfo.cpp | 94 ++++------ .../RowInputFormatWithDiagnosticInfo.h | 1 + src/Storages/StorageFile.cpp | 163 ++++-------------- 14 files changed, 280 insertions(+), 325 deletions(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 93af751e2b9..42a33e7c0d2 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1150,59 +1150,6 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars } -void ClientBase::errorRowsSink(const QueryPipeline & pipeline) -{ - const auto & processors = pipeline.getProcessors(); - if (!processors.empty()) - { - IInputFormat * input_format = dynamic_cast(processors[0].get()); - if (!input_format || input_format->isEmptyMultiErrorRows()) - return; - - String errors_file_path = global_context->getSettingsRef().input_format_record_errors_file_name; - if (global_context->getSettingsRef().isChanged("input_format_record_errors_file_name")) - { - while (fs::exists(errors_file_path)) - errors_file_path += "_new"; - } - - String database_name; - String table_name; - try - { - table_name = global_context->getInsertionTable().getTableName(); - database_name = global_context->getInsertionTable().getDatabaseName(); - } - catch (...) 
- { - /// Ignore - } - - const auto & multi_error_rows = input_format->getMultiErrorRows(); - try - { - WriteBufferFromFile out(errors_file_path); - for (const auto & error_rows : multi_error_rows) - { - for (const auto & error_row : error_rows) - { - String row_in_file = "Time: " + error_row.time + "\nDatabase: " + database_name + "\nTable: " + table_name - + "\nOffset: " + toString(error_row.offset) + "\nReason: \n" + error_row.reason - + "\nRaw data: " + error_row.raw_data + "\n------\n"; - out.write(row_in_file.data(), row_in_file.size()); - } - } - out.sync(); - } - catch (...) - { - std::cout << "Caught Exception " + getCurrentExceptionMessage(false) + " while writing the Errors file " + errors_file_path - << std::endl; - } - } -} - - void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_description, ASTPtr parsed_query) { /// Get columns description from variable or (if it was empty) create it from sample. @@ -1362,49 +1309,39 @@ try PullingAsyncPipelineExecutor executor(pipeline); Block block; - try + while (executor.pull(block)) { - while (executor.pull(block)) + if (!cancelled && QueryInterruptHandler::cancelled()) { - if (!cancelled && QueryInterruptHandler::cancelled()) - { - cancelQuery(); - executor.cancel(); - return; - } + cancelQuery(); + executor.cancel(); + return; + } - /// Check if server send Log packet - receiveLogsAndProfileEvents(parsed_query); + /// Check if server send Log packet + receiveLogsAndProfileEvents(parsed_query); - /// Check if server send Exception packet - auto packet_type = connection->checkPacket(0); - if (packet_type && *packet_type == Protocol::Server::Exception) - { - /** + /// Check if server send Exception packet + auto packet_type = connection->checkPacket(0); + if (packet_type && *packet_type == Protocol::Server::Exception) + { + /** * We're exiting with error, so it makes sense to kill the * input stream without waiting for it to complete. */ - executor.cancel(); - return; - } - - if (block) - { - connection->sendData(block, /* name */"", /* scalar */false); - processed_rows += block.rows(); - } + executor.cancel(); + return; } - errorRowsSink(pipeline); + if (block) + { + connection->sendData(block, /* name */"", /* scalar */false); + processed_rows += block.rows(); + } + } - if (!have_more_data) - connection->sendData({}, "", false); - } - catch (...) - { - errorRowsSink(pipeline); - throw; - } + if (!have_more_data) + connection->sendData({}, "", false); } catch (...) { diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index fda38acca99..6b19c1b8e02 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -132,7 +132,6 @@ private: void onEndOfStream(); void onProfileEvents(Block & block); - void errorRowsSink(const QueryPipeline & pipeline); void sendData(Block & sample, const ColumnsDescription & columns_description, ASTPtr parsed_query); void sendDataFrom(ReadBuffer & buf, Block & sample, const ColumnsDescription & columns_description, ASTPtr parsed_query, bool have_more_data = false); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 5e011a43409..05eb182dfd5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -773,7 +773,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). 
In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ M(Float, input_format_allow_errors_ratio, 0, "Maximum relative amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ - M(String, input_format_record_errors_file_name, "_input_format_error_rows_", "Name of the file used to record errors while reading text formats (CSV, TSV).", 0) \ + M(String, input_format_record_errors_file_path, "_input_format_error_rows_", "Path of the file used to record errors while reading text formats (CSV, TSV).", 0) \ + M(ErrorsOutputFormat, errors_output_format, "CSV", "Method to write Errors to text output. Possible values: 'CSV'.", 0) \ \ M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index daa678c0141..a2bf1b46ee8 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -158,5 +158,8 @@ IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation , ErrorCodes::BAD_ARGUMENTS, {"str", FormatSettings::MsgPackUUIDRepresentation::STR}, {"ext", FormatSettings::MsgPackUUIDRepresentation::EXT}}) +IMPLEMENT_SETTING_ENUM(ErrorsOutputFormat, ErrorCodes::BAD_ARGUMENTS, + {{"CSV", FormatSettings::ErrorsOutputFormat::CSV}}) + } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index b5e908defc7..4d750167895 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -183,4 +183,6 @@ DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) DECLARE_SETTING_ENUM_WITH_RENAME(MsgPackUUIDRepresentation, FormatSettings::MsgPackUUIDRepresentation) +DECLARE_SETTING_ENUM_WITH_RENAME(ErrorsOutputFormat, FormatSettings::ErrorsOutputFormat) + } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 780b6bb6201..0dd303c0f36 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -224,6 +224,21 @@ InputFormatPtr FormatFactory::getInput( parallel_parsing = false; } + String errors_file_path = context->getSettingsRef().input_format_record_errors_file_path; + bool is_changed = context->getSettingsRef().isChanged("input_format_record_errors_file_path"); + auto output_format = context->getSettingsRef().errors_output_format; + String database_name; + String table_name; + try + { + table_name = context->getInsertionTable().getTableName(); + database_name = context->getInsertionTable().getDatabaseName(); + } + catch (...) 
+ { + /// Ignore + } + if (parallel_parsing) { const auto & input_getter = getCreators(name).input_creator; @@ -243,11 +258,22 @@ InputFormatPtr FormatFactory::getInput( ParallelParsingInputFormat::Params params{ buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing, context->getApplicationType() == Context::ApplicationType::SERVER}; - return std::make_shared(params); + auto format = std::make_shared(params); + format->setErrorsLogger(std::make_shared( + context->getApplicationType(), + context->getUserFilesPath(), + errors_file_path, + is_changed, + output_format, + database_name, + table_name)); + return format; } auto format = getInputFormat(name, buf, sample, context, max_block_size, format_settings); + format->setErrorsLogger(std::make_shared( + context->getApplicationType(), context->getUserFilesPath(), errors_file_path, is_changed, output_format, database_name, table_name)); return format; } diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 9466a64590d..59603d91e98 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -292,6 +292,11 @@ struct FormatSettings bool use_replace = false; bool quote_names = true; } sql_insert; + + enum class ErrorsOutputFormat + { + CSV, + }; }; } diff --git a/src/Processors/Formats/IInputFormat.cpp b/src/Processors/Formats/IInputFormat.cpp index 674a4affc46..f5cba35206f 100644 --- a/src/Processors/Formats/IInputFormat.cpp +++ b/src/Processors/Formats/IInputFormat.cpp @@ -1,10 +1,93 @@ #include +#include #include +#include +#include +#include namespace DB { +InputFormatErrorsLogger::InputFormatErrorsLogger( + Context::ApplicationType app_type, + const String & user_files_path, + String & path_in_setting, + bool is_changed, + FormatSettings::ErrorsOutputFormat output_format, + const String & database_, + const String & table_) + : database(database_), table(table_) +{ + if (app_type == Context::ApplicationType::SERVER) + { + trimLeft(path_in_setting, '/'); + } + else if (!is_changed) + { + path_in_setting = "/tmp/" + path_in_setting; + } + errors_file_path = user_files_path + path_in_setting; + if (is_changed) + { + while (fs::exists(errors_file_path)) + { + errors_file_path += "_new"; + } + } + write_buf = std::make_shared(errors_file_path); + + Block header{ + {std::make_shared(), "time"}, + {std::make_shared(), "database"}, + {std::make_shared(), "table"}, + {std::make_shared(), "offset"}, + {std::make_shared(), "reason"}, + {std::make_shared(), "raw_data"}}; + FormatSettings format_settings; + RowOutputFormatParams out_params; + + if (output_format == FormatSettings::ErrorsOutputFormat::CSV) + writer = std::make_shared(*write_buf, header, false, false, out_params, format_settings); +} + +void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) +{ + for (auto & ch : entry.reason) + { + if (ch == '\"') + ch = '\''; + } + for (auto & ch : entry.raw_data) + { + if (ch == '\"') + ch = '\''; + } + Block error{ + {DataTypeString().createColumnConst(1, entry.time)->convertToFullColumnIfConst(), std::make_shared(), "time"}, + {DataTypeString().createColumnConst(1, database)->convertToFullColumnIfConst(), std::make_shared(), "database"}, + {DataTypeString().createColumnConst(1, table)->convertToFullColumnIfConst(), std::make_shared(), "table"}, + {DataTypeUInt32().createColumnConst(1, entry.offset)->convertToFullColumnIfConst(), std::make_shared(), "offset"}, + {DataTypeString().createColumnConst(1, 
entry.reason)->convertToFullColumnIfConst(), std::make_shared(), "reason"}, + {DataTypeString().createColumnConst(1, entry.raw_data)->convertToFullColumnIfConst(), + std::make_shared(), + "raw_data"}}; + writer->write(error); +} + +void InputFormatErrorsLogger::logError(ErrorEntry entry) +{ + logErrorImpl(entry); +} + +ParallelInputFormatErrorsLogger::~ParallelInputFormatErrorsLogger() = default; + +void ParallelInputFormatErrorsLogger::logError(ErrorEntry entry) +{ + std::lock_guard lock(write_mutex); + logErrorImpl(entry); +} + IInputFormat::IInputFormat(Block header, ReadBuffer & in_) : ISource(std::move(header)), in(&in_) { diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 0587c7daa85..198d37688df 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -11,15 +12,64 @@ namespace DB using ColumnMappingPtr = std::shared_ptr; -struct InputFormatErrorRow +class InputFormatErrorsLogger { - String time; - size_t offset; - String reason; - String raw_data; +public: + struct ErrorEntry + { + String time; + size_t offset; + String reason; + String raw_data; + }; + + InputFormatErrorsLogger( + Context::ApplicationType app_type, + const String & user_files_path, + String & path_in_setting, + bool is_changed, + FormatSettings::ErrorsOutputFormat output_format, + const String & database_, + const String & table_); + + virtual ~InputFormatErrorsLogger() = default; + + virtual void logError(ErrorEntry entry); + void logErrorImpl(ErrorEntry entry); + +private: + String errors_file_path; + std::shared_ptr write_buf; + OutputFormatPtr writer; + + String database; + String table; }; -using InputFormatErrorRows = std::vector; +using InputFormatErrorsLoggerPtr = std::shared_ptr; + +class ParallelInputFormatErrorsLogger : public InputFormatErrorsLogger +{ +public: + ParallelInputFormatErrorsLogger( + Context::ApplicationType app_type, + const String & user_files_path, + String & path_in_setting, + bool is_changed, + FormatSettings::ErrorsOutputFormat output_format, + const String & database_, + const String & table_) + : InputFormatErrorsLogger(app_type, user_files_path, path_in_setting, is_changed, output_format, database_, table_) + { + } + + ~ParallelInputFormatErrorsLogger() override; + + void logError(ErrorEntry entry) override; + +private: + std::mutex write_mutex; +}; /** Input format is a source, that reads data from ReadBuffer. 
*/ @@ -65,29 +115,17 @@ public: void addBuffer(std::unique_ptr buffer) { owned_buffers.emplace_back(std::move(buffer)); } - void addErrorRow(InputFormatErrorRow && error_row) { error_rows.emplace_back(error_row); } - InputFormatErrorRows & getErrorRows() { return error_rows; } - - void addErrorRows(InputFormatErrorRows & source_error_rows) - { - multi_error_rows.emplace_back(InputFormatErrorRows()); - multi_error_rows.back().swap(source_error_rows); - } - const std::list & getMultiErrorRows() { return multi_error_rows; } - - bool isEmptyErrorRows() { return error_rows.empty(); } - bool isEmptyMultiErrorRows() { return multi_error_rows.empty(); } + void setErrorsLogger(const InputFormatErrorsLoggerPtr & errors_logger_) { errors_logger = errors_logger_; } protected: ColumnMappingPtr column_mapping{}; + InputFormatErrorsLoggerPtr errors_logger; + private: /// Number of currently parsed chunk (if parallel parsing is enabled) size_t current_unit_number = 0; - InputFormatErrorRows error_rows; - std::list multi_error_rows; - std::vector> owned_buffers; }; diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 57e9a8c3c30..49b94a255e3 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -122,11 +122,13 @@ Chunk IRowInputFormat::generate() { /// Error while trying to obtain verbose diagnostic. Ok to ignore. } + trimLeft(diagnostic, '\n'); trimRight(diagnostic, '\n'); auto now_time = time(nullptr); - addErrorRow(InputFormatErrorRow{to_string(now_time), total_rows, diagnostic, raw_data}); + if (errors_logger) + errors_logger->logError(InputFormatErrorsLogger::ErrorEntry{to_string(now_time), total_rows, diagnostic, raw_data}); /// Logic for possible skipping of errors. diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp index 79023d557b8..e0693b489bd 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp @@ -75,6 +75,7 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr threa InputFormatPtr input_format = internal_parser_creator(read_buffer); input_format->setCurrentUnitNumber(current_ticket_number); + input_format->setErrorsLogger(errors_logger); InternalParser parser(input_format); unit.chunk_ext.chunk.clear(); @@ -88,30 +89,12 @@ void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr threa // We don't know how many blocks will be. So we have to read them all // until an empty block occurred. Chunk chunk; - try + while (!parsing_finished && (chunk = parser.getChunk())) { - while (!parsing_finished && (chunk = parser.getChunk())) - { - /// Variable chunk is moved, but it is not really used in the next iteration. - /// NOLINTNEXTLINE(bugprone-use-after-move, hicpp-invalid-access-moved) - unit.chunk_ext.chunk.emplace_back(std::move(chunk)); - unit.chunk_ext.block_missing_values.emplace_back(parser.getMissingValues()); - } - - if (!input_format->isEmptyErrorRows()) - { - std::lock_guard lock(mutex); - addErrorRows(input_format->getErrorRows()); - } - } - catch (...) - { - if (!input_format->isEmptyErrorRows()) - { - std::lock_guard lock(mutex); - addErrorRows(input_format->getErrorRows()); - } - throw; + /// Variable chunk is moved, but it is not really used in the next iteration. 
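The refactoring above replaces the per-format error-row lists with a logger object that the input format simply calls into; the parallel-parsing variant differs only in taking a lock around the write, so one logger instance can be shared by all parser threads. A condensed sketch of that shape (simplified names, and a plain stream standing in for the CSV output format used by the real InputFormatErrorsLogger):

    #include <iostream>
    #include <mutex>
    #include <string>
    #include <utility>

    class ErrorsLogger
    {
    public:
        struct Entry
        {
            std::string time;
            size_t offset;
            std::string reason;
            std::string raw_data;
        };

        virtual ~ErrorsLogger() = default;

        virtual void logError(Entry entry) { logErrorImpl(std::move(entry)); }

    protected:
        void logErrorImpl(Entry entry)
        {
            /// The real logger writes (time, database, table, offset, reason, raw_data)
            /// through a row output format such as CSV; a plain stream stands in here.
            std::cout << entry.time << ',' << entry.offset << ',' << entry.reason << '\n';
        }
    };

    class ParallelErrorsLogger : public ErrorsLogger
    {
    public:
        void logError(Entry entry) override
        {
            std::lock_guard<std::mutex> lock(write_mutex);
            logErrorImpl(std::move(entry));
        }

    private:
        std::mutex write_mutex;
    };

    int main()
    {
        ParallelErrorsLogger logger;
        logger.logError({"2022-09-09 00:00:00", 42, "cannot parse", "a,b,c"});
        return 0;
    }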
+ /// NOLINTNEXTLINE(bugprone-use-after-move, hicpp-invalid-access-moved) + unit.chunk_ext.chunk.emplace_back(std::move(chunk)); + unit.chunk_ext.block_missing_values.emplace_back(parser.getMissingValues()); } /// Extract column_mapping from first parser to propagate it to others diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index 9e5d15680d9..31e2175a1be 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -35,12 +35,15 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() offset_of_current_row = in->offset(); } -String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() +std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawDataImpl(bool is_errors_record) { - if (in->eof()) - return "Buffer has gone, cannot extract information about what has been parsed."; + WriteBufferFromOwnString out_diag; + WriteBufferFromOwnString out_data; - WriteBufferFromOwnString out; + if (in->eof()) + return std::make_tuple( + "Buffer has gone, cannot extract information about what has been parsed.", + "Buffer has gone, cannot extract information about what has been parsed."); const auto & header = getPort().getHeader(); MutableColumns columns = header.cloneEmptyColumns(); @@ -49,8 +52,9 @@ String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() size_t bytes_read_at_start_of_buffer = in->count() - in->offset(); if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) { - out << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; - return out.str(); + out_diag << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; + out_data << "Could not collect raw data because two last rows aren't in buffer (rare case)\n"; + return std::make_tuple(out_diag.str(), out_data.str()); } max_length_of_column_name = 0; @@ -65,77 +69,26 @@ String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() /// Roll back the cursor to the beginning of the previous or current row and parse all over again. But now we derive detailed information. 
- if (offset_of_prev_row <= in->buffer().size()) + if (!is_errors_record && offset_of_prev_row <= in->buffer().size()) { in->position() = in->buffer().begin() + offset_of_prev_row; - out << "\nRow " << (row_num - 1) << ":\n"; - if (!parseRowAndPrintDiagnosticInfo(columns, out)) - return out.str(); + out_diag << "\nRow " << (row_num - 1) << ":\n"; + if (!parseRowAndPrintDiagnosticInfo(columns, out_diag)) + return std::make_tuple(out_diag.str(), out_data.str()); } else { if (in->buffer().size() < offset_of_current_row) { - out << "Could not print diagnostic info because parsing of data hasn't started.\n"; - return out.str(); + out_diag << "Could not print diagnostic info because parsing of data hasn't started.\n"; + out_data << "Could not collect raw data because parsing of data hasn't started.\n"; + return std::make_tuple(out_diag.str(), out_data.str()); } in->position() = in->buffer().begin() + offset_of_current_row; } - out << "\nRow " << row_num << ":\n"; - parseRowAndPrintDiagnosticInfo(columns, out); - out << "\n"; - - return out.str(); -} - -std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawData() -{ - WriteBufferFromOwnString out_diag; - WriteBufferFromOwnString out_data; - - if (in->eof()) - { - out_diag << "Buffer has gone, cannot extract information about what has been parsed."; - out_data << "Buffer has gone, cannot extract information about what has been parsed."; - return std::make_tuple(out_diag.str(), out_data.str()); - } - - const auto & header = getPort().getHeader(); - MutableColumns columns = header.cloneEmptyColumns(); - - /// It is possible to display detailed diagnostics only if the last and next to last rows are still in the read buffer. - size_t bytes_read_at_start_of_buffer = in->count() - in->offset(); - if (bytes_read_at_start_of_buffer != bytes_read_at_start_of_buffer_on_prev_row) - { - out_diag << "Could not collect diagnostic because two last rows are not in buffer (rare case)"; - out_data << "Could not collect raw data because two last rows are not in buffer (rare case)"; - return std::make_tuple(out_diag.str(), out_data.str()); - } - - max_length_of_column_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).name.size() > max_length_of_column_name) - max_length_of_column_name = header.safeGetByPosition(i).name.size(); - - max_length_of_data_type_name = 0; - for (size_t i = 0; i < header.columns(); ++i) - if (header.safeGetByPosition(i).type->getName().size() > max_length_of_data_type_name) - max_length_of_data_type_name = header.safeGetByPosition(i).type->getName().size(); - - /// Roll back the cursor to the beginning of the current row and parse all over again. 
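For reference, the raw-data half of getDiagnosticAndRawDataImpl amounts to rolling the cursor back to the offset remembered before the row was parsed and copying bytes until a row terminator or the end of the buffer. A small self-contained sketch of that scan (illustrative only, operating on a plain std::string instead of a ReadBuffer; the bounds check runs before each character test, so the loop cannot read past the end of the buffer):

    #include <cstddef>
    #include <iostream>
    #include <string>

    static std::string rawRow(const std::string & buffer, size_t offset_of_current_row)
    {
        std::string raw;
        if (offset_of_current_row > buffer.size())
            return raw;  /// parsing has not started yet or the offset is stale
        for (size_t pos = offset_of_current_row; pos < buffer.size(); ++pos)
        {
            char c = buffer[pos];
            if (c == '\n' || c == '\r' || c == '\0')
                break;
            raw += c;
        }
        return raw;
    }

    int main()
    {
        std::string buf = "1,ok\nxx,broken row\n3,ok\n";
        std::cout << rawRow(buf, 5) << '\n';  /// prints "xx,broken row"
        return 0;
    }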
- - if (in->buffer().size() < offset_of_current_row) - { - out_diag << "Could not collect diagnostic because parsing of data has not started."; - out_data << "Could not collect raw data because parsing of data has not started."; - return std::make_tuple(out_diag.str(), out_data.str()); - } - - in->position() = in->buffer().begin() + offset_of_current_row; - char * data = in->position(); while (*data != '\n' && *data != '\r' && *data != '\0' && data < in->buffer().end()) { @@ -143,11 +96,24 @@ std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRaw ++data; } + out_diag << "\nRow " << row_num << ":\n"; parseRowAndPrintDiagnosticInfo(columns, out_diag); + out_diag << "\n"; return std::make_tuple(out_diag.str(), out_data.str()); } +String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() +{ + auto diagnostic_and_raw_data = getDiagnosticAndRawDataImpl(false); + return std::get<0>(diagnostic_and_raw_data); +} + +std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawData() +{ + return getDiagnosticAndRawDataImpl(true); +} + bool RowInputFormatWithDiagnosticInfo::deserializeFieldAndPrintDiagnosticInfo(const String & col_name, const DataTypePtr & type, IColumn & column, diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index 14f11e91ff0..97f0a2599fa 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -14,6 +14,7 @@ class RowInputFormatWithDiagnosticInfo : public IRowInputFormat public: RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_); + std::tuple getDiagnosticAndRawDataImpl(bool is_errors_record); String getDiagnosticInfo() override; std::tuple getDiagnosticAndRawData() override; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index e77b5349f19..0788a9f73d8 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -509,87 +509,6 @@ public: return storage->getName(); } - void errorRowsSink() - { - const auto & processors = pipeline->getProcessors(); - if (!processors.empty()) - { - IInputFormat * input_format = dynamic_cast(processors[0].get()); - if (!input_format || (input_format->isEmptyErrorRows() && input_format->isEmptyMultiErrorRows())) - return; - - String errors_file_name = context->getSettingsRef().input_format_record_errors_file_name; - String errors_file_path = context->getUserFilesPath(); - if (!errors_file_path.empty()) - trimLeft(errors_file_name, '/'); - errors_file_path += errors_file_name; - if (context->getSettingsRef().isChanged("input_format_record_errors_file_name")) - { - while (fs::exists(errors_file_path)) - errors_file_path += "_new"; - } - - String database_name; - String table_name; - try - { - table_name = context->getInsertionTable().getTableName(); - database_name = context->getInsertionTable().getDatabaseName(); - } - catch (...) 
- { - /// Ignore - } - - if (context->hasGlobalContext() && (context->getGlobalContext()->getApplicationType() == Context::ApplicationType::SERVER)) - { - Poco::Logger * log = &Poco::Logger::get("StorageFileSource"); - - InputFormatErrorRows & error_rows = input_format->getErrorRows(); - try - { - WriteBufferFromFile out(errors_file_path); - for (auto & error_row : error_rows) - { - String row_in_file = "Time: " + error_row.time + "\nDatabase: " + database_name + "\nTable: " + table_name - + "\nOffset: " + toString(error_row.offset) + "\nReason: \n" + error_row.reason - + "\nRaw data: " + error_row.raw_data + "\n------\n"; - out.write(row_in_file.data(), row_in_file.size()); - } - out.sync(); - } - catch (...) - { - LOG_ERROR( - log, "Caught Exception {} while writing the Errors file {}", getCurrentExceptionMessage(false), errors_file_path); - } - } - else - { - const auto & multi_error_rows = input_format->getMultiErrorRows(); - try - { - WriteBufferFromFile out(errors_file_path); - for (const auto & error_rows : multi_error_rows) - { - for (const auto & error_row : error_rows) - { - String row_in_file = "Time: " + error_row.time + "\nDatabase: " + database_name + "\nTable: " + table_name - + "\nOffset: " + toString(error_row.offset) + "\nReason: \n" + error_row.reason - + "\nRaw data: " + error_row.raw_data + "\n------\n"; - out.write(row_in_file.data(), row_in_file.size()); - } - } - out.sync(); - } - catch (...) - { - /// Ignore - } - } - } - } - Chunk generate() override { while (!finished_generate) @@ -637,56 +556,46 @@ public: } Chunk chunk; - try + if (reader->pull(chunk)) { - if (reader->pull(chunk)) + UInt64 num_rows = chunk.getNumRows(); + + /// Enrich with virtual columns. + if (files_info->need_path_column) { - UInt64 num_rows = chunk.getNumRows(); - - /// Enrich with virtual columns. - if (files_info->need_path_column) - { - auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, current_path); - chunk.addColumn(column->convertToFullColumnIfConst()); - } - - if (files_info->need_file_column) - { - size_t last_slash_pos = current_path.find_last_of('/'); - auto file_name = current_path.substr(last_slash_pos + 1); - - auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, std::move(file_name)); - chunk.addColumn(column->convertToFullColumnIfConst()); - } - - if (num_rows) - { - auto bytes_per_row = std::ceil(static_cast(chunk.bytes()) / num_rows); - size_t total_rows_approx = std::ceil(static_cast(files_info->total_bytes_to_read) / bytes_per_row); - total_rows_approx_accumulated += total_rows_approx; - ++total_rows_count_times; - total_rows_approx = total_rows_approx_accumulated / total_rows_count_times; - - /// We need to add diff, because total_rows_approx is incremental value. - /// It would be more correct to send total_rows_approx as is (not a diff), - /// but incrementation of total_rows_to_read does not allow that. - /// A new field can be introduces for that to be sent to client, but it does not worth it. - if (total_rows_approx > total_rows_approx_prev) - { - size_t diff = total_rows_approx - total_rows_approx_prev; - addTotalRowsApprox(diff); - total_rows_approx_prev = total_rows_approx; - } - } - return chunk; + auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, current_path); + chunk.addColumn(column->convertToFullColumnIfConst()); } - errorRowsSink(); - } - catch (...) 
- { - errorRowsSink(); - throw; + if (files_info->need_file_column) + { + size_t last_slash_pos = current_path.find_last_of('/'); + auto file_name = current_path.substr(last_slash_pos + 1); + + auto column = DataTypeLowCardinality{std::make_shared()}.createColumnConst(num_rows, std::move(file_name)); + chunk.addColumn(column->convertToFullColumnIfConst()); + } + + if (num_rows) + { + auto bytes_per_row = std::ceil(static_cast(chunk.bytes()) / num_rows); + size_t total_rows_approx = std::ceil(static_cast(files_info->total_bytes_to_read) / bytes_per_row); + total_rows_approx_accumulated += total_rows_approx; + ++total_rows_count_times; + total_rows_approx = total_rows_approx_accumulated / total_rows_count_times; + + /// We need to add diff, because total_rows_approx is incremental value. + /// It would be more correct to send total_rows_approx as is (not a diff), + /// but incrementation of total_rows_to_read does not allow that. + /// A new field can be introduces for that to be sent to client, but it does not worth it. + if (total_rows_approx > total_rows_approx_prev) + { + size_t diff = total_rows_approx - total_rows_approx_prev; + addTotalRowsApprox(diff); + total_rows_approx_prev = total_rows_approx; + } + } + return chunk; } /// Read only once for file descriptor. From f21ab12d8e364e0fce760bb5228a8ef00c8c0c66 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 6 Sep 2022 22:21:31 +0800 Subject: [PATCH 16/87] Use sleep to wait for flush --- ...7_opentelemetry_insert_on_distributed_table.sh | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index 6f766e9f3bb..b9b5dd2d424 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -33,15 +33,11 @@ SET insert_distributed_sync=0; INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1 VALUES(1),(2); " -# Wait complete of ASYNC INSERT on distributed table -wait +# Wait 10s to complete of ASYNC INSERT on distributed table and flush of system.opentelemetry_span_log +sleep 10 # Check log ${CLICKHOUSE_CLIENT} -nq " --- Flush opentelemetry span log on all nodes -SET distributed_ddl_output_mode = 'none'; -SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; - -- Above INSERT will insert data to two shards respectively, so there will be two spans generated SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%writeToLocal%'; SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%processFile%'; @@ -64,12 +60,11 @@ SET insert_distributed_sync=1; INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1 VALUES(1),(2); " +# Wait 10s to flush system.opentelemetry_span_log +sleep 10 + # Check log ${CLICKHOUSE_CLIENT} -nq " --- Flush opentelemetry span log on all nodes -SET distributed_ddl_output_mode = 'none'; -SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; - -- Above INSERT will insert data to two shards in the same flow, so there should be two spans generated with the same operation name SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%runWritingJob%'; " From 
329f31e7ab2f2436a9dc45ec001a06563893852d Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Thu, 8 Sep 2022 11:38:10 +0800 Subject: [PATCH 17/87] Address review comments Signed-off-by: Frank Chen --- src/Storages/Distributed/DirectoryMonitor.cpp | 6 +-- ...entelemetry_insert_on_distributed_table.sh | 41 ++++++++----------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/src/Storages/Distributed/DirectoryMonitor.cpp b/src/Storages/Distributed/DirectoryMonitor.cpp index f84ddeb4f5e..b3d2494c7e1 100644 --- a/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/src/Storages/Distributed/DirectoryMonitor.cpp @@ -141,7 +141,7 @@ namespace size_t bytes = 0; UInt32 shard_num = 0; - std::string cluster_name; + std::string cluster; std::string distributed_table; std::string remote_table; @@ -203,7 +203,7 @@ namespace if (header_buf.hasPendingData()) { readVarUInt(distributed_header.shard_num, header_buf); - readStringBinary(distributed_header.cluster_name, header_buf); + readStringBinary(distributed_header.cluster, header_buf); readStringBinary(distributed_header.distributed_table, header_buf); readStringBinary(distributed_header.remote_table, header_buf); } @@ -638,7 +638,7 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa distributed_header.client_info.client_trace_context, this->storage.getContext()->getOpenTelemetrySpanLog()); thread_trace_context->root_span.addAttribute("clickhouse.shard_num", distributed_header.shard_num); - thread_trace_context->root_span.addAttribute("clickhouse.cluster", distributed_header.cluster_name); + thread_trace_context->root_span.addAttribute("clickhouse.cluster", distributed_header.cluster); thread_trace_context->root_span.addAttribute("clickhouse.distributed", distributed_header.distributed_table); thread_trace_context->root_span.addAttribute("clickhouse.remote", distributed_header.remote_table); thread_trace_context->root_span.addAttribute("clickhouse.rows", distributed_header.rows); diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index b9b5dd2d424..319f0151b1d 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: distributed +# Tags: no-fasttest, distributed set -ue @@ -10,9 +10,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
"$CURDIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} -nq " -SET distributed_ddl_output_mode = 'none'; - +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; TRUNCATE TABLE IF EXISTS system.opentelemetry_span_log ON CLUSTER test_cluster_two_shards; @@ -28,16 +26,17 @@ CREATE TABLE default.local_opentelemetry ON CLUSTER test_cluster_two_shards (key # Do test with opentelemetry enabled # ${CLICKHOUSE_CLIENT} -nq " --- Make sure it's async -SET insert_distributed_sync=0; -INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1 VALUES(1),(2); +INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1, insert_distributed_sync=0 VALUES(1),(2); " -# Wait 10s to complete of ASYNC INSERT on distributed table and flush of system.opentelemetry_span_log -sleep 10 - # Check log -${CLICKHOUSE_CLIENT} -nq " +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " +-- Make sure INSERT on distributed finishes +SYSTEM FLUSH DISTRIBUTED default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; + +-- Make sure opentelemetry span log flushed +SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; + -- Above INSERT will insert data to two shards respectively, so there will be two spans generated SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%writeToLocal%'; SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%processFile%'; @@ -47,24 +46,17 @@ SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_s # INSERT SYNC test # Do test with opentelemetry enabled and in SYNC mode # -${CLICKHOUSE_CLIENT} -nq " - +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " -- Clear log -SET distributed_ddl_output_mode = 'none'; TRUNCATE TABLE IF EXISTS system.opentelemetry_span_log ON CLUSTER test_cluster_two_shards; --- Make sure it's SYNC -SET insert_distributed_sync=1; - --- INSERT test -INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1 VALUES(1),(2); +INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1, insert_distributed_sync=1 VALUES(1),(2); " -# Wait 10s to flush system.opentelemetry_span_log -sleep 10 - # Check log -${CLICKHOUSE_CLIENT} -nq " +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " +SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; + -- Above INSERT will insert data to two shards in the same flow, so there should be two spans generated with the same operation name SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%runWritingJob%'; " @@ -72,8 +64,7 @@ SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_s # # Cleanup # -${CLICKHOUSE_CLIENT} -nq " -SET distributed_ddl_output_mode = 'none'; +${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " DROP TABLE default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; DROP TABLE default.local_opentelemetry ON CLUSTER test_cluster_two_shards; " From a9863805222bbb90152b57540a3577caa104096a Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Thu, 8 Sep 2022 17:25:29 +0800 Subject: [PATCH 18/87] Update src/Storages/Distributed/DirectoryMonitor.cpp Co-authored-by: Azat Khuzhin --- src/Storages/Distributed/DirectoryMonitor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/src/Storages/Distributed/DirectoryMonitor.cpp b/src/Storages/Distributed/DirectoryMonitor.cpp index b3d2494c7e1..e8d48431a9e 100644 --- a/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/src/Storages/Distributed/DirectoryMonitor.cpp @@ -202,10 +202,10 @@ namespace if (header_buf.hasPendingData()) { - readVarUInt(distributed_header.shard_num, header_buf); - readStringBinary(distributed_header.cluster, header_buf); - readStringBinary(distributed_header.distributed_table, header_buf); - readStringBinary(distributed_header.remote_table, header_buf); + readVarUInt(distributed_header.shard_num, header_buf); + readStringBinary(distributed_header.cluster, header_buf); + readStringBinary(distributed_header.distributed_table, header_buf); + readStringBinary(distributed_header.remote_table, header_buf); } /// Add handling new data here, for example: From 469ceaa156c93240a35d6219364361fc61116aa8 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Fri, 9 Sep 2022 00:37:18 +0800 Subject: [PATCH 19/87] code optimization --- src/Client/ClientBase.cpp | 29 +++---- src/Client/ClientBase.h | 2 + src/Core/Settings.h | 4 +- src/Core/SettingsEnums.cpp | 3 - src/Core/SettingsEnums.h | 2 - src/Formats/FormatFactory.cpp | 33 ++------ src/Formats/FormatSettings.h | 5 -- src/Interpreters/Context.h | 1 + src/Processors/Formats/IInputFormat.cpp | 83 ------------------- src/Processors/Formats/IInputFormat.h | 61 +------------- src/Processors/Formats/IRowInputFormat.cpp | 52 ++++++------ src/Processors/Formats/IRowInputFormat.h | 4 +- .../Formats/InputFormatErrorsLogger.cpp | 81 ++++++++++++++++++ .../Formats/InputFormatErrorsLogger.h | 54 ++++++++++++ .../RowInputFormatWithDiagnosticInfo.cpp | 16 ++-- .../RowInputFormatWithDiagnosticInfo.h | 4 +- 16 files changed, 205 insertions(+), 229 deletions(-) create mode 100644 src/Processors/Formats/InputFormatErrorsLogger.cpp create mode 100644 src/Processors/Formats/InputFormatErrorsLogger.h diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 42a33e7c0d2..7d05cbb0681 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1080,6 +1080,20 @@ bool ClientBase::receiveSampleBlock(Block & out, ColumnsDescription & columns_de } +void ClientBase::setInsertionTable(const ASTInsertQuery & insert_query) +{ + if (!global_context->hasInsertionTable() && insert_query.table) + { + String table = insert_query.table->as().shortName(); + if (!table.empty()) + { + String database = insert_query.database ? insert_query.database->as().shortName() : ""; + global_context->setInsertionTable(StorageID(database, table)); + } + } +} + + void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr parsed_query) { auto query = query_to_execute; @@ -1129,23 +1143,10 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars { /// If structure was received (thus, server has not thrown an exception), /// send our data with that structure. - bool change = false; - if (global_context->getInsertionTable().empty() && parsed_insert_query.table) - { - String table = parsed_insert_query.table->as().shortName(); - if (!table.empty()) - { - change = true; - String database = parsed_insert_query.database ? 
parsed_insert_query.database->as().shortName() : ""; - global_context->setInsertionTable(StorageID(database, table)); - } - } + setInsertionTable(parsed_insert_query); sendData(sample, columns_description, parsed_query); receiveEndOfQuery(); - - if (change) - global_context->setInsertionTable(StorageID::createEmpty()); } } diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 6b19c1b8e02..17165721a53 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -113,6 +113,8 @@ protected: std::vector & external_tables_arguments, std::vector & hosts_and_ports_arguments) = 0; + void setInsertionTable(const ASTInsertQuery & insert_query); + private: void receiveResult(ASTPtr parsed_query); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 05eb182dfd5..969d9bab82b 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -773,8 +773,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(UInt64, input_format_allow_errors_num, 0, "Maximum absolute amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ M(Float, input_format_allow_errors_ratio, 0, "Maximum relative amount of errors while reading text formats (like CSV, TSV). In case of error, if at least absolute or relative amount of errors is lower than corresponding value, will skip until next line and continue.", 0) \ - M(String, input_format_record_errors_file_path, "_input_format_error_rows_", "Path of the file used to record errors while reading text formats (CSV, TSV).", 0) \ - M(ErrorsOutputFormat, errors_output_format, "CSV", "Method to write Errors to text output. Possible values: 'CSV'.", 0) \ + M(String, input_format_record_errors_file_path, "", "Path of the file used to record errors while reading text formats (CSV, TSV).", 0) \ + M(String, errors_output_format, "CSV", "Method to write Errors to text output.", 0) \ \ M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index a2bf1b46ee8..daa678c0141 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -158,8 +158,5 @@ IMPLEMENT_SETTING_ENUM(MsgPackUUIDRepresentation , ErrorCodes::BAD_ARGUMENTS, {"str", FormatSettings::MsgPackUUIDRepresentation::STR}, {"ext", FormatSettings::MsgPackUUIDRepresentation::EXT}}) -IMPLEMENT_SETTING_ENUM(ErrorsOutputFormat, ErrorCodes::BAD_ARGUMENTS, - {{"CSV", FormatSettings::ErrorsOutputFormat::CSV}}) - } diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 4d750167895..b5e908defc7 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -183,6 +183,4 @@ DECLARE_SETTING_ENUM_WITH_RENAME(EscapingRule, FormatSettings::EscapingRule) DECLARE_SETTING_ENUM_WITH_RENAME(MsgPackUUIDRepresentation, FormatSettings::MsgPackUUIDRepresentation) -DECLARE_SETTING_ENUM_WITH_RENAME(ErrorsOutputFormat, FormatSettings::ErrorsOutputFormat) - } diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 0dd303c0f36..48d95a1973d 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -224,21 +224,6 @@ InputFormatPtr FormatFactory::getInput( parallel_parsing = false; } - String errors_file_path = 
context->getSettingsRef().input_format_record_errors_file_path; - bool is_changed = context->getSettingsRef().isChanged("input_format_record_errors_file_path"); - auto output_format = context->getSettingsRef().errors_output_format; - String database_name; - String table_name; - try - { - table_name = context->getInsertionTable().getTableName(); - database_name = context->getInsertionTable().getDatabaseName(); - } - catch (...) - { - /// Ignore - } - if (parallel_parsing) { const auto & input_getter = getCreators(name).input_creator; @@ -259,21 +244,19 @@ InputFormatPtr FormatFactory::getInput( buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing, context->getApplicationType() == Context::ApplicationType::SERVER}; auto format = std::make_shared(params); - format->setErrorsLogger(std::make_shared( - context->getApplicationType(), - context->getUserFilesPath(), - errors_file_path, - is_changed, - output_format, - database_name, - table_name)); + if (!context->getSettingsRef().input_format_record_errors_file_path.toString().empty()) + { + format->setErrorsLogger(std::make_shared(context)); + } return format; } auto format = getInputFormat(name, buf, sample, context, max_block_size, format_settings); - format->setErrorsLogger(std::make_shared( - context->getApplicationType(), context->getUserFilesPath(), errors_file_path, is_changed, output_format, database_name, table_name)); + if (!context->getSettingsRef().input_format_record_errors_file_path.toString().empty()) + { + format->setErrorsLogger(std::make_shared(context)); + } return format; } diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 59603d91e98..9466a64590d 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -292,11 +292,6 @@ struct FormatSettings bool use_replace = false; bool quote_names = true; } sql_insert; - - enum class ErrorsOutputFormat - { - CSV, - }; }; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 2997fc370bf..bf2660bf4d7 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -612,6 +612,7 @@ public: void killCurrentQuery(); + bool hasInsertionTable() { return !insertion_table.empty(); } void setInsertionTable(StorageID db_and_table) { insertion_table = std::move(db_and_table); } const StorageID & getInsertionTable() const { return insertion_table; } diff --git a/src/Processors/Formats/IInputFormat.cpp b/src/Processors/Formats/IInputFormat.cpp index f5cba35206f..674a4affc46 100644 --- a/src/Processors/Formats/IInputFormat.cpp +++ b/src/Processors/Formats/IInputFormat.cpp @@ -1,93 +1,10 @@ #include -#include #include -#include -#include -#include namespace DB { -InputFormatErrorsLogger::InputFormatErrorsLogger( - Context::ApplicationType app_type, - const String & user_files_path, - String & path_in_setting, - bool is_changed, - FormatSettings::ErrorsOutputFormat output_format, - const String & database_, - const String & table_) - : database(database_), table(table_) -{ - if (app_type == Context::ApplicationType::SERVER) - { - trimLeft(path_in_setting, '/'); - } - else if (!is_changed) - { - path_in_setting = "/tmp/" + path_in_setting; - } - errors_file_path = user_files_path + path_in_setting; - if (is_changed) - { - while (fs::exists(errors_file_path)) - { - errors_file_path += "_new"; - } - } - write_buf = std::make_shared(errors_file_path); - - Block header{ - {std::make_shared(), "time"}, - {std::make_shared(), "database"}, - 
{std::make_shared(), "table"}, - {std::make_shared(), "offset"}, - {std::make_shared(), "reason"}, - {std::make_shared(), "raw_data"}}; - FormatSettings format_settings; - RowOutputFormatParams out_params; - - if (output_format == FormatSettings::ErrorsOutputFormat::CSV) - writer = std::make_shared(*write_buf, header, false, false, out_params, format_settings); -} - -void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) -{ - for (auto & ch : entry.reason) - { - if (ch == '\"') - ch = '\''; - } - for (auto & ch : entry.raw_data) - { - if (ch == '\"') - ch = '\''; - } - Block error{ - {DataTypeString().createColumnConst(1, entry.time)->convertToFullColumnIfConst(), std::make_shared(), "time"}, - {DataTypeString().createColumnConst(1, database)->convertToFullColumnIfConst(), std::make_shared(), "database"}, - {DataTypeString().createColumnConst(1, table)->convertToFullColumnIfConst(), std::make_shared(), "table"}, - {DataTypeUInt32().createColumnConst(1, entry.offset)->convertToFullColumnIfConst(), std::make_shared(), "offset"}, - {DataTypeString().createColumnConst(1, entry.reason)->convertToFullColumnIfConst(), std::make_shared(), "reason"}, - {DataTypeString().createColumnConst(1, entry.raw_data)->convertToFullColumnIfConst(), - std::make_shared(), - "raw_data"}}; - writer->write(error); -} - -void InputFormatErrorsLogger::logError(ErrorEntry entry) -{ - logErrorImpl(entry); -} - -ParallelInputFormatErrorsLogger::~ParallelInputFormatErrorsLogger() = default; - -void ParallelInputFormatErrorsLogger::logError(ErrorEntry entry) -{ - std::lock_guard lock(write_mutex); - logErrorImpl(entry); -} - IInputFormat::IInputFormat(Block header, ReadBuffer & in_) : ISource(std::move(header)), in(&in_) { diff --git a/src/Processors/Formats/IInputFormat.h b/src/Processors/Formats/IInputFormat.h index 198d37688df..091447e96ee 100644 --- a/src/Processors/Formats/IInputFormat.h +++ b/src/Processors/Formats/IInputFormat.h @@ -1,8 +1,8 @@ #pragma once +#include #include #include -#include #include #include @@ -12,65 +12,6 @@ namespace DB using ColumnMappingPtr = std::shared_ptr; -class InputFormatErrorsLogger -{ -public: - struct ErrorEntry - { - String time; - size_t offset; - String reason; - String raw_data; - }; - - InputFormatErrorsLogger( - Context::ApplicationType app_type, - const String & user_files_path, - String & path_in_setting, - bool is_changed, - FormatSettings::ErrorsOutputFormat output_format, - const String & database_, - const String & table_); - - virtual ~InputFormatErrorsLogger() = default; - - virtual void logError(ErrorEntry entry); - void logErrorImpl(ErrorEntry entry); - -private: - String errors_file_path; - std::shared_ptr write_buf; - OutputFormatPtr writer; - - String database; - String table; -}; - -using InputFormatErrorsLoggerPtr = std::shared_ptr; - -class ParallelInputFormatErrorsLogger : public InputFormatErrorsLogger -{ -public: - ParallelInputFormatErrorsLogger( - Context::ApplicationType app_type, - const String & user_files_path, - String & path_in_setting, - bool is_changed, - FormatSettings::ErrorsOutputFormat output_format, - const String & database_, - const String & table_) - : InputFormatErrorsLogger(app_type, user_files_path, path_in_setting, is_changed, output_format, database_, table_) - { - } - - ~ParallelInputFormatErrorsLogger() override; - - void logError(ErrorEntry entry) override; - -private: - std::mutex write_mutex; -}; - /** Input format is a source, that reads data from ReadBuffer. 
*/ class IInputFormat : public ISource diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index 49b94a255e3..b3882b43570 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -54,6 +54,31 @@ IRowInputFormat::IRowInputFormat(Block header, ReadBuffer & in_, Params params_) { } +void IRowInputFormat::logError() +{ + String diagnostic; + String raw_data; + try + { + std::tie(diagnostic, raw_data) = getDiagnosticAndRawData(); + } + catch (const Exception & exception) + { + diagnostic = "Cannot get diagnostic: " + exception.message(); + raw_data = "Cannot get raw data: " + exception.message(); + } + catch (...) + { + /// Error while trying to obtain verbose diagnostic. Ok to ignore. + } + trimLeft(diagnostic, '\n'); + trimRight(diagnostic, '\n'); + + auto now_time = time(nullptr); + + errors_logger->logError(InputFormatErrorsLogger::ErrorEntry{now_time, total_rows, diagnostic, raw_data}); +} + Chunk IRowInputFormat::generate() { if (total_rows == 0) @@ -106,30 +131,6 @@ Chunk IRowInputFormat::generate() } catch (Exception & e) { - /// Record error info for this row - String diagnostic; - String raw_data; - try - { - std::tie(diagnostic, raw_data) = getDiagnosticAndRawData(); - } - catch (const Exception & exception) - { - diagnostic = "Cannot get diagnostic: " + exception.message(); - raw_data = "Cannot get raw data: " + exception.message(); - } - catch (...) - { - /// Error while trying to obtain verbose diagnostic. Ok to ignore. - } - trimLeft(diagnostic, '\n'); - trimRight(diagnostic, '\n'); - - auto now_time = time(nullptr); - - if (errors_logger) - errors_logger->logError(InputFormatErrorsLogger::ErrorEntry{to_string(now_time), total_rows, diagnostic, raw_data}); - /// Logic for possible skipping of errors. if (!isParseError(e.code())) @@ -138,6 +139,9 @@ Chunk IRowInputFormat::generate() if (params.allow_errors_num == 0 && params.allow_errors_ratio == 0) throw; + if (errors_logger) + logError(); + ++num_errors; Float64 current_error_ratio = static_cast(num_errors) / total_rows; diff --git a/src/Processors/Formats/IRowInputFormat.h b/src/Processors/Formats/IRowInputFormat.h index 7ef766c1d85..a11462549ff 100644 --- a/src/Processors/Formats/IRowInputFormat.h +++ b/src/Processors/Formats/IRowInputFormat.h @@ -66,7 +66,9 @@ protected: /// If not implemented, returns empty string. 
virtual std::string getDiagnosticInfo() { return {}; } /// Get diagnostic info and raw data for a row - virtual std::tuple getDiagnosticAndRawData() { return std::make_tuple("", ""); } + virtual std::pair getDiagnosticAndRawData() { return std::make_pair("", ""); } + + void logError(); const BlockMissingValues & getMissingValues() const override { return block_missing_values; } diff --git a/src/Processors/Formats/InputFormatErrorsLogger.cpp b/src/Processors/Formats/InputFormatErrorsLogger.cpp new file mode 100644 index 00000000000..4280a16e101 --- /dev/null +++ b/src/Processors/Formats/InputFormatErrorsLogger.cpp @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + const String DEFAULT_OUTPUT_FORMAT = "CSV"; +} + +InputFormatErrorsLogger::InputFormatErrorsLogger(const ContextPtr & context) +{ + String output_format = context->getSettingsRef().errors_output_format; + if (!FormatFactory::instance().isOutputFormat(output_format)) + output_format = DEFAULT_OUTPUT_FORMAT; + try + { + table = context->getInsertionTable().getTableName(); + database = context->getInsertionTable().getDatabaseName(); + } + catch (...) + { + /// Ignore + } + + String path_in_setting = context->getSettingsRef().input_format_record_errors_file_path; + errors_file_path = context->getApplicationType() == Context::ApplicationType::SERVER ? context->getUserFilesPath() + path_in_setting + : path_in_setting; + while (fs::exists(errors_file_path)) + { + errors_file_path += "_new"; + } + write_buf = std::make_shared(errors_file_path); + + header = Block{ + {std::make_shared(), "time"}, + {std::make_shared(std::make_shared()), "database"}, + {std::make_shared(std::make_shared()), "table"}, + {std::make_shared(), "offset"}, + {std::make_shared(), "reason"}, + {std::make_shared(), "raw_data"}}; + + writer = context->getOutputFormat(output_format, *write_buf, header); +} + +void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) +{ + auto error = header.cloneEmpty(); + auto columns = error.mutateColumns(); + columns[0]->insert(entry.time); + database.empty() ? columns[1]->insert(Null()) : columns[1]->insert(database); + table.empty() ? 
columns[2]->insert(Null()) : columns[2]->insert(table); + columns[3]->insert(entry.offset); + columns[4]->insert(entry.reason); + columns[5]->insert(entry.raw_data); + error.setColumns(std::move(columns)); + + writer->write(error); +} + +void InputFormatErrorsLogger::logError(ErrorEntry entry) +{ + logErrorImpl(entry); +} + +ParallelInputFormatErrorsLogger::~ParallelInputFormatErrorsLogger() = default; + +void ParallelInputFormatErrorsLogger::logError(ErrorEntry entry) +{ + std::lock_guard lock(write_mutex); + logErrorImpl(entry); +} + +} diff --git a/src/Processors/Formats/InputFormatErrorsLogger.h b/src/Processors/Formats/InputFormatErrorsLogger.h new file mode 100644 index 00000000000..20cfb5da133 --- /dev/null +++ b/src/Processors/Formats/InputFormatErrorsLogger.h @@ -0,0 +1,54 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +class InputFormatErrorsLogger +{ +public: + struct ErrorEntry + { + time_t time; + size_t offset; + String reason; + String raw_data; + }; + + InputFormatErrorsLogger(const ContextPtr & context); + + virtual ~InputFormatErrorsLogger() = default; + + virtual void logError(ErrorEntry entry); + void logErrorImpl(ErrorEntry entry); + +private: + Block header; + + String errors_file_path; + std::shared_ptr write_buf; + OutputFormatPtr writer; + + String database; + String table; +}; + +using InputFormatErrorsLoggerPtr = std::shared_ptr; + +class ParallelInputFormatErrorsLogger : public InputFormatErrorsLogger +{ +public: + ParallelInputFormatErrorsLogger(const ContextPtr & context) : InputFormatErrorsLogger(context) { } + + ~ParallelInputFormatErrorsLogger() override; + + void logError(ErrorEntry entry) override; + +private: + std::mutex write_mutex; +}; + +} diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp index 31e2175a1be..35a86bc476d 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.cpp @@ -35,13 +35,13 @@ void RowInputFormatWithDiagnosticInfo::updateDiagnosticInfo() offset_of_current_row = in->offset(); } -std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawDataImpl(bool is_errors_record) +std::pair RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawDataImpl(bool is_errors_record) { WriteBufferFromOwnString out_diag; WriteBufferFromOwnString out_data; if (in->eof()) - return std::make_tuple( + return std::make_pair( "Buffer has gone, cannot extract information about what has been parsed.", "Buffer has gone, cannot extract information about what has been parsed."); @@ -54,7 +54,7 @@ std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRaw { out_diag << "Could not print diagnostic info because two last rows aren't in buffer (rare case)\n"; out_data << "Could not collect raw data because two last rows aren't in buffer (rare case)\n"; - return std::make_tuple(out_diag.str(), out_data.str()); + return std::make_pair(out_diag.str(), out_data.str()); } max_length_of_column_name = 0; @@ -75,7 +75,7 @@ std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRaw out_diag << "\nRow " << (row_num - 1) << ":\n"; if (!parseRowAndPrintDiagnosticInfo(columns, out_diag)) - return std::make_tuple(out_diag.str(), out_data.str()); + return std::make_pair(out_diag.str(), out_data.str()); } else { @@ -83,14 +83,14 @@ std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRaw { out_diag << "Could not print diagnostic info because parsing of data hasn't started.\n"; 
out_data << "Could not collect raw data because parsing of data hasn't started.\n"; - return std::make_tuple(out_diag.str(), out_data.str()); + return std::make_pair(out_diag.str(), out_data.str()); } in->position() = in->buffer().begin() + offset_of_current_row; } char * data = in->position(); - while (*data != '\n' && *data != '\r' && *data != '\0' && data < in->buffer().end()) + while (data < in->buffer().end() && *data != '\n' && *data != '\r' && *data != '\0') { out_data << *data; ++data; @@ -100,7 +100,7 @@ std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRaw parseRowAndPrintDiagnosticInfo(columns, out_diag); out_diag << "\n"; - return std::make_tuple(out_diag.str(), out_data.str()); + return std::make_pair(out_diag.str(), out_data.str()); } String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() @@ -109,7 +109,7 @@ String RowInputFormatWithDiagnosticInfo::getDiagnosticInfo() return std::get<0>(diagnostic_and_raw_data); } -std::tuple RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawData() +std::pair RowInputFormatWithDiagnosticInfo::getDiagnosticAndRawData() { return getDiagnosticAndRawDataImpl(true); } diff --git a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h index 97f0a2599fa..49793fcd208 100644 --- a/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h +++ b/src/Processors/Formats/RowInputFormatWithDiagnosticInfo.h @@ -14,9 +14,9 @@ class RowInputFormatWithDiagnosticInfo : public IRowInputFormat public: RowInputFormatWithDiagnosticInfo(const Block & header_, ReadBuffer & in_, const Params & params_); - std::tuple getDiagnosticAndRawDataImpl(bool is_errors_record); + std::pair getDiagnosticAndRawDataImpl(bool is_errors_record); String getDiagnosticInfo() override; - std::tuple getDiagnosticAndRawData() override; + std::pair getDiagnosticAndRawData() override; void resetParser() override; From 237abffdba538c9a4acc85db4b15dcdba4e735ac Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Fri, 9 Sep 2022 11:59:53 +0800 Subject: [PATCH 20/87] Improve test Signed-off-by: Frank Chen --- ...etry_insert_on_distributed_table.reference | 8 +- ...entelemetry_insert_on_distributed_table.sh | 92 ++++++++++++++----- 2 files changed, 72 insertions(+), 28 deletions(-) diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference index fac9fabce8a..98fb6a68656 100644 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference @@ -1,4 +1,4 @@ -{'clickhouse.shard_num':'1','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} -{'clickhouse.shard_num':'2','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} -{'clickhouse.shard_num':'1','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} 
-{'clickhouse.shard_num':'2','clickhouse.cluster':'test_cluster_two_shards','clickhouse.distributed':'default.dist_opentelemetry','clickhouse.remote':'default.local_opentelemetry','clickhouse.rows':'1','clickhouse.bytes':'8'} +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index 319f0151b1d..7ec0650aaac 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -11,60 +11,104 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " -SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; -TRUNCATE TABLE IF EXISTS system.opentelemetry_span_log ON CLUSTER test_cluster_two_shards; +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.dist_opentelemetry; +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.local_opentelemetry; -DROP TABLE IF EXISTS default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; -DROP TABLE IF EXISTS default.local_opentelemetry ON CLUSTER test_cluster_two_shards; - -CREATE TABLE default.dist_opentelemetry ON CLUSTER test_cluster_two_shards (key UInt64) Engine=Distributed('test_cluster_two_shards', default, local_opentelemetry, key % 2); -CREATE TABLE default.local_opentelemetry ON CLUSTER test_cluster_two_shards (key UInt64) Engine=MergeTree ORDER BY key; +CREATE TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry (key UInt64) Engine=Distributed('test_cluster_two_shards', ${CLICKHOUSE_DATABASE}, local_opentelemetry, key % 2); +CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=MergeTree ORDER BY key; " # # INSERT ASYNC test # Do test with opentelemetry enabled # -${CLICKHOUSE_CLIENT} -nq " -INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1, insert_distributed_sync=0 VALUES(1),(2); -" +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=0 VALUES(1),(2)" | +${CLICKHOUSE_CURL} \ + -X POST \ + -H "traceparent: 00-$trace_id-5250000000000525-01" \ + -H "tracestate: some custom state" \ + "${CLICKHOUSE_URL}" \ + --data @- # Check log ${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " -- Make sure INSERT on distributed finishes -SYSTEM FLUSH DISTRIBUTED default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; +SYSTEM FLUSH DISTRIBUTED ${CLICKHOUSE_DATABASE}.dist_opentelemetry; -- Make sure opentelemetry span log flushed -SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; +SYSTEM FLUSH LOGS; -- Above INSERT will insert data to two shards respectively, so there will be two spans generated -SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%writeToLocal%'; -SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%processFile%'; +SELECT count() FROM system.opentelemetry_span_log +WHERE lower(hex(trace_id)) = '${trace_id}' +AND operation_name like '%writeToLocal%' +AND attribute['clickhouse.shard_num'] = '1' +AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' +AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' +AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' +AND attribute['clickhouse.rows'] 
= '1' +AND attribute['clickhouse.bytes'] = '8' +; + +SELECT count() FROM system.opentelemetry_span_log +WHERE lower(hex(trace_id)) = '${trace_id}' +AND operation_name like '%writeToLocal%' +AND attribute['clickhouse.shard_num'] = '2' +AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' +AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' +AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' +AND attribute['clickhouse.rows'] = '1' +AND attribute['clickhouse.bytes'] = '8' +; + " # # INSERT SYNC test # Do test with opentelemetry enabled and in SYNC mode # -${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " --- Clear log -TRUNCATE TABLE IF EXISTS system.opentelemetry_span_log ON CLUSTER test_cluster_two_shards; - -INSERT INTO default.dist_opentelemetry SETTINGS opentelemetry_start_trace_probability=1, insert_distributed_sync=1 VALUES(1),(2); -" +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=1 VALUES(1),(2)" | +${CLICKHOUSE_CURL} \ + -X POST \ + -H "traceparent: 00-$trace_id-5250000000000525-01" \ + -H "tracestate: some custom state" \ + "${CLICKHOUSE_URL}" \ + --data @- # Check log ${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " -SYSTEM FLUSH LOGS ON CLUSTER test_cluster_two_shards; +SYSTEM FLUSH LOGS; -- Above INSERT will insert data to two shards in the same flow, so there should be two spans generated with the same operation name -SELECT attribute FROM cluster('test_cluster_two_shards', system, opentelemetry_span_log) WHERE operation_name like '%runWritingJob%'; +SELECT count() FROM system.opentelemetry_span_log +WHERE lower(hex(trace_id)) = '${trace_id}' +AND operation_name like '%runWritingJob%' +AND attribute['clickhouse.shard_num'] = '1' +AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' +AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' +AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' +AND attribute['clickhouse.rows'] = '1' +AND attribute['clickhouse.bytes'] = '8' +; + +SELECT count() FROM system.opentelemetry_span_log +WHERE lower(hex(trace_id)) = '${trace_id}' +AND operation_name like '%runWritingJob%' +AND attribute['clickhouse.shard_num'] = '2' +AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' +AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' +AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' +AND attribute['clickhouse.rows'] = '1' +AND attribute['clickhouse.bytes'] = '8' +; " # # Cleanup # ${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " -DROP TABLE default.dist_opentelemetry ON CLUSTER test_cluster_two_shards; -DROP TABLE default.local_opentelemetry ON CLUSTER test_cluster_two_shards; +DROP TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry; +DROP TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry; " From 92a92baa33842f6a1aeee232f22a17b4fd71e923 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Fri, 9 Sep 2022 12:18:27 +0800 Subject: [PATCH 21/87] Simplify test Signed-off-by: Frank Chen --- ...entelemetry_insert_on_distributed_table.sh | 126 ++++++------------ 1 file changed, 42 insertions(+), 84 deletions(-) diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index 
7ec0650aaac..1b4e1da97f9 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -1,16 +1,44 @@ #!/usr/bin/env bash # Tags: no-fasttest, distributed -set -ue - -unset CLICKHOUSE_LOG_COMMENT - CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " +function insert() +{ + echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=$2 VALUES(1),(2)" | + ${CLICKHOUSE_CURL} \ + -X POST \ + -H "traceparent: 00-$1-5150000000000515-01" \ + -H "tracestate: some custom state" \ + "${CLICKHOUSE_URL}" \ + --data @- +} + +function check_span() +{ +${CLICKHOUSE_CLIENT} -nq " + SYSTEM FLUSH LOGS; + + SELECT count() FROM system.opentelemetry_span_log + WHERE lower(hex(trace_id)) = '${1}' + AND operation_name like '${2}' + AND attribute['clickhouse.shard_num'] = '${3}' + AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' + AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' + AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' + AND attribute['clickhouse.rows'] = '1' + AND attribute['clickhouse.bytes'] = '8' + ;" +} + + +# +# Prepare tables for tests +# +${CLICKHOUSE_CLIENT} -nq " DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.dist_opentelemetry; DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.local_opentelemetry; @@ -19,96 +47,26 @@ CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=Merg " # -# INSERT ASYNC test -# Do test with opentelemetry enabled +# ASYNC INSERT test with opentelemetry enabled # trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); -echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=0 VALUES(1),(2)" | -${CLICKHOUSE_CURL} \ - -X POST \ - -H "traceparent: 00-$trace_id-5250000000000525-01" \ - -H "tracestate: some custom state" \ - "${CLICKHOUSE_URL}" \ - --data @- +insert $trace_id 0 +check_span $trace_id '%writeToLocal%' '1' +check_span $trace_id '%writeToLocal%' '2' -# Check log -${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " --- Make sure INSERT on distributed finishes -SYSTEM FLUSH DISTRIBUTED ${CLICKHOUSE_DATABASE}.dist_opentelemetry; - --- Make sure opentelemetry span log flushed -SYSTEM FLUSH LOGS; - --- Above INSERT will insert data to two shards respectively, so there will be two spans generated -SELECT count() FROM system.opentelemetry_span_log -WHERE lower(hex(trace_id)) = '${trace_id}' -AND operation_name like '%writeToLocal%' -AND attribute['clickhouse.shard_num'] = '1' -AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' -AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' -AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' -AND attribute['clickhouse.rows'] = '1' -AND attribute['clickhouse.bytes'] = '8' -; - -SELECT count() FROM system.opentelemetry_span_log -WHERE lower(hex(trace_id)) = '${trace_id}' -AND operation_name like '%writeToLocal%' -AND attribute['clickhouse.shard_num'] = '2' -AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' -AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' -AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' -AND attribute['clickhouse.rows'] 
= '1' -AND attribute['clickhouse.bytes'] = '8' -; - -" # -# INSERT SYNC test -# Do test with opentelemetry enabled and in SYNC mode +# SYNC INSERT SYNC test with opentelemetry enabled # trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); -echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=1 VALUES(1),(2)" | -${CLICKHOUSE_CURL} \ - -X POST \ - -H "traceparent: 00-$trace_id-5250000000000525-01" \ - -H "tracestate: some custom state" \ - "${CLICKHOUSE_URL}" \ - --data @- - -# Check log -${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " -SYSTEM FLUSH LOGS; - --- Above INSERT will insert data to two shards in the same flow, so there should be two spans generated with the same operation name -SELECT count() FROM system.opentelemetry_span_log -WHERE lower(hex(trace_id)) = '${trace_id}' -AND operation_name like '%runWritingJob%' -AND attribute['clickhouse.shard_num'] = '1' -AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' -AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' -AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' -AND attribute['clickhouse.rows'] = '1' -AND attribute['clickhouse.bytes'] = '8' -; - -SELECT count() FROM system.opentelemetry_span_log -WHERE lower(hex(trace_id)) = '${trace_id}' -AND operation_name like '%runWritingJob%' -AND attribute['clickhouse.shard_num'] = '2' -AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' -AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' -AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' -AND attribute['clickhouse.rows'] = '1' -AND attribute['clickhouse.bytes'] = '8' -; -" +insert $trace_id 1 +check_span $trace_id '%runWritingJob%' '1' +check_span $trace_id '%runWritingJob%' '2' # # Cleanup # -${CLICKHOUSE_CLIENT} --distributed_ddl_output_mode=none -nq " +${CLICKHOUSE_CLIENT} -nq " DROP TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry; DROP TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry; " From 2fb0ae7002938720751fe606c3725a84dfbcad88 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Fri, 9 Sep 2022 19:02:42 +0800 Subject: [PATCH 22/87] Update test case --- .../02417_opentelemetry_insert_on_distributed_table.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index 1b4e1da97f9..55457d26249 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -52,7 +52,7 @@ CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=Merg trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); insert $trace_id 0 check_span $trace_id '%writeToLocal%' '1' -check_span $trace_id '%writeToLocal%' '2' +check_span $trace_id '%processFile%' '2' # From bd9fabc3f7a312fad331739e689ef1ab9d1c8bc9 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Fri, 9 Sep 2022 23:21:37 +0800 Subject: [PATCH 23/87] code optimization, add test --- src/Formats/FormatFactory.cpp | 4 +-- src/Interpreters/StorageID.h | 2 ++ .../Formats/InputFormatErrorsLogger.cpp | 13 +++----- ...ecord_errors_row_by_input_format.reference | 6 ++++ ...02421_record_errors_row_by_input_format.sh | 32 +++++++++++++++++++ 5 files changed, 46 insertions(+), 11 
deletions(-) create mode 100644 tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference create mode 100644 tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 48d95a1973d..01a3811728f 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -244,7 +244,7 @@ InputFormatPtr FormatFactory::getInput( buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing, context->getApplicationType() == Context::ApplicationType::SERVER}; auto format = std::make_shared(params); - if (!context->getSettingsRef().input_format_record_errors_file_path.toString().empty()) + if (!settings.input_format_record_errors_file_path.toString().empty()) { format->setErrorsLogger(std::make_shared(context)); } @@ -253,7 +253,7 @@ InputFormatPtr FormatFactory::getInput( auto format = getInputFormat(name, buf, sample, context, max_block_size, format_settings); - if (!context->getSettingsRef().input_format_record_errors_file_path.toString().empty()) + if (!settings.input_format_record_errors_file_path.toString().empty()) { format->setErrorsLogger(std::make_shared(context)); } diff --git a/src/Interpreters/StorageID.h b/src/Interpreters/StorageID.h index c60c3aec9c6..43710988243 100644 --- a/src/Interpreters/StorageID.h +++ b/src/Interpreters/StorageID.h @@ -69,6 +69,8 @@ struct StorageID return uuid != UUIDHelpers::Nil; } + bool hasDatabase() const { return !database_name.empty(); } + bool operator<(const StorageID & rhs) const; bool operator==(const StorageID & rhs) const; diff --git a/src/Processors/Formats/InputFormatErrorsLogger.cpp b/src/Processors/Formats/InputFormatErrorsLogger.cpp index 4280a16e101..24f526a9b50 100644 --- a/src/Processors/Formats/InputFormatErrorsLogger.cpp +++ b/src/Processors/Formats/InputFormatErrorsLogger.cpp @@ -20,15 +20,10 @@ InputFormatErrorsLogger::InputFormatErrorsLogger(const ContextPtr & context) String output_format = context->getSettingsRef().errors_output_format; if (!FormatFactory::instance().isOutputFormat(output_format)) output_format = DEFAULT_OUTPUT_FORMAT; - try - { + if (context->hasInsertionTable()) table = context->getInsertionTable().getTableName(); + if (context->getInsertionTable().hasDatabase()) database = context->getInsertionTable().getDatabaseName(); - } - catch (...) - { - /// Ignore - } String path_in_setting = context->getSettingsRef().input_format_record_errors_file_path; errors_file_path = context->getApplicationType() == Context::ApplicationType::SERVER ? context->getUserFilesPath() + path_in_setting @@ -55,8 +50,8 @@ void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) auto error = header.cloneEmpty(); auto columns = error.mutateColumns(); columns[0]->insert(entry.time); - database.empty() ? columns[1]->insert(Null()) : columns[1]->insert(database); - table.empty() ? columns[2]->insert(Null()) : columns[2]->insert(table); + database.empty() ? columns[1]->insertDefault() : columns[1]->insert(database); + table.empty() ? 
columns[2]->insertDefault() : columns[2]->insert(table); columns[3]->insert(entry.offset); columns[4]->insert(entry.reason); columns[5]->insert(entry.raw_data); diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference new file mode 100644 index 00000000000..570c8033cbd --- /dev/null +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference @@ -0,0 +1,6 @@ +default data 2 Row 2:\nColumn 0, name: c1, type: UInt8, parsed text: "2"\nColumn 1, name: c2, type: UInt8, ERROR: text "ab,34,4" is not like UInt8 2,a +default data 3 Row 3:\nColumn 0, name: c1, type: UInt8, ERROR: text "b,34,45," is not like UInt8 b,3 +default data 5 Row 5:\nColumn 0, name: c1, type: UInt8, parsed text: "5"\nColumn 1, name: c2, type: UInt8, ERROR: text "c6,6" is not like UInt8 5,c +\N data 2 Row 2:\nColumn 0, name: A, type: UInt8, parsed text: "2"\nColumn 1, name: B, type: UInt8, ERROR: text "ab,34,4" is not like UInt8 2,a +\N data 3 Row 3:\nColumn 0, name: A, type: UInt8, ERROR: text "b,34,45," is not like UInt8 b,3 +\N data 5 Row 5:\nColumn 0, name: A, type: UInt8, parsed text: "5"\nColumn 1, name: B, type: UInt8, ERROR: text "c6,6" is not like UInt8 5,c \ No newline at end of file diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh new file mode 100644 index 00000000000..b00b4f146a6 --- /dev/null +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +set -eu + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Data preparation. 
+ +CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/ +echo -e "1,1\n2,a\nb,3\n4,4\n5,c\n6,6" > ${CLICKHOUSE_USER_FILES_PATH}/a.csv + +${CLICKHOUSE_CLIENT} --query "drop table if exists data;" +${CLICKHOUSE_CLIENT} --query "create table data (A UInt8, B UInt8) engine=MergeTree() order by A;" + +# Server side +${CLICKHOUSE_CLIENT} --input_format_allow_errors_num 4 --input_format_record_errors_file_path "errors_server" --query "insert into data select * from file('a.csv', 'CSV', 'c1 UInt8, c2 UInt8');" +${CLICKHOUSE_CLIENT} --query "select * except (time) from file('errors_server', 'CSV', 'time DateTime, database Nullable(String), table Nullable(String), offset UInt32, reason String, raw_data String');" + +# Client side +${CLICKHOUSE_CLIENT} --input_format_allow_errors_num 4 --input_format_record_errors_file_path "${CLICKHOUSE_USER_FILES_PATH}/errors_client" --query "insert into data(A, B) format CSV" < ${CLICKHOUSE_USER_FILES_PATH}/a.csv +${CLICKHOUSE_CLIENT} --query "select * except (time) from file('errors_client', 'CSV', 'time DateTime, database Nullable(String), table Nullable(String), offset UInt32, reason String, raw_data String');" + +# Restore +${CLICKHOUSE_CLIENT} --query "drop table if exists data;" +rm ${CLICKHOUSE_USER_FILES_PATH}/a.csv +rm ${CLICKHOUSE_USER_FILES_PATH}/errors_server +rm ${CLICKHOUSE_USER_FILES_PATH}/errors_client From 877776e569a5f83d685a954ef62877ddbd0f6602 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 9 Sep 2022 18:04:29 +0200 Subject: [PATCH 24/87] Update src/Functions/grouping.h --- src/Functions/grouping.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Functions/grouping.h b/src/Functions/grouping.h index 7a9df462b23..b9ef6ffc107 100644 --- a/src/Functions/grouping.h +++ b/src/Functions/grouping.h @@ -19,6 +19,8 @@ protected: static constexpr UInt64 ONE = 1; const ColumnNumbers arguments_indexes; + // Initial implementation of GROUPING function returned 1 if the argument is used as an aggregation key. + // This differs from the behavior described in the standard and other DBMS. 
const bool force_compatibility; static constexpr UInt64 COMPATIBLE_MODE[] = {1, 0}; From 51badf3d0ad8d08f9e472506e998448922e8ce34 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Sat, 10 Sep 2022 00:08:50 +0800 Subject: [PATCH 25/87] fix --- src/Interpreters/Context.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index bf2660bf4d7..81e082acecc 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -612,7 +612,7 @@ public: void killCurrentQuery(); - bool hasInsertionTable() { return !insertion_table.empty(); } + bool hasInsertionTable() const { return !insertion_table.empty(); } void setInsertionTable(StorageID db_and_table) { insertion_table = std::move(db_and_table); } const StorageID & getInsertionTable() const { return insertion_table; } From da8e5631eac1db12e273d69ffe2ee90ea4df3ce8 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Sat, 10 Sep 2022 00:47:34 +0800 Subject: [PATCH 26/87] fix test --- .../02421_record_errors_row_by_input_format.reference | 2 +- .../0_stateless/02421_record_errors_row_by_input_format.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference index 570c8033cbd..67ec09b70b7 100644 --- a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.reference @@ -3,4 +3,4 @@ default data 3 Row 3:\nColumn 0, name: c1, type: UInt8, ERROR: text "b,36,6" is not like UInt8 5,c \N data 2 Row 2:\nColumn 0, name: A, type: UInt8, parsed text: "2"\nColumn 1, name: B, type: UInt8, ERROR: text "ab,34,4" is not like UInt8 2,a \N data 3 Row 3:\nColumn 0, name: A, type: UInt8, ERROR: text "b,34,45," is not like UInt8 b,3 -\N data 5 Row 5:\nColumn 0, name: A, type: UInt8, parsed text: "5"\nColumn 1, name: B, type: UInt8, ERROR: text "c6,6" is not like UInt8 5,c \ No newline at end of file +\N data 5 Row 5:\nColumn 0, name: A, type: UInt8, parsed text: "5"\nColumn 1, name: B, type: UInt8, ERROR: text "c6,6" is not like UInt8 5,c diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh index b00b4f146a6..dd782c6ed40 100644 --- a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh @@ -30,3 +30,4 @@ ${CLICKHOUSE_CLIENT} --query "drop table if exists data;" rm ${CLICKHOUSE_USER_FILES_PATH}/a.csv rm ${CLICKHOUSE_USER_FILES_PATH}/errors_server rm ${CLICKHOUSE_USER_FILES_PATH}/errors_client + From 57146c9361fd7ba87f7798f9a001db2a1b1d523d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 9 Sep 2022 19:43:14 +0200 Subject: [PATCH 27/87] Fix typos in SortedBlocksWriter Signed-off-by: Azat Khuzhin --- src/Interpreters/SortedBlocksWriter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/SortedBlocksWriter.cpp b/src/Interpreters/SortedBlocksWriter.cpp index 0acb056690f..20859e23ea7 100644 --- a/src/Interpreters/SortedBlocksWriter.cpp +++ b/src/Interpreters/SortedBlocksWriter.cpp @@ -87,7 +87,7 @@ void SortedBlocksWriter::insert(Block && block) { std::lock_guard lock{insert_mutex}; - /// insert bock into BlocksList undef lock + /// insert block into BlocksList under lock inserted_blocks.insert(std::move(block)); size_t 
total_row_count = inserted_blocks.row_count + row_count_in_flush; From 763bb18f98ac34521fe342158a44527e3318d58a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 9 Sep 2022 19:23:45 +0200 Subject: [PATCH 28/87] Fix SIGSEGV in SortedBlocksWriter in case of empty block CI found one issue [1]. Here is the stack trace for invalid read:
stack trace ``` 0: DB::TemporaryFileLazySource::TemporaryFileLazySource(std::__1::basic_string, std::__1::allocator > const&, DB::Block const&) [inlined] std::__1::basic_string, std::__1::allocator >::__is_long(this="") const at string:1445:22 1: DB::TemporaryFileLazySource::TemporaryFileLazySource(std::__1::basic_string, std::__1::allocator > const&, DB::Block const&) [inlined] std::__1::basic_string, std::__1::allocator >::basic_string(this="", __str="") at string:1927 2: DB::TemporaryFileLazySource::TemporaryFileLazySource(this=0x00007f3aec105f58, path_="", header_=0x00007f38ffd93b40) at TemporaryFileLazySource.cpp:11 3: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr > const&) const [inlined] DB::TemporaryFileLazySource* std::__1::construct_at, std::__1::allocator > const&, DB::Block, DB::TemporaryFileLazySource*>(__args=0x00007f38ffd91560) at construct_at.h:38:50 4: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr > const&) const [inlined] void std::__1::allocator_traits >::construct, std::__1::allocator > const&, DB::Block, void, void>(__args=0x00007f38ffd91560) at allocator_traits.h:298 5: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr > const&) const [inlined] std::__1::__shared_ptr_emplace >::__shared_ptr_emplace, std::__1::allocator > const&, DB::Block>(this=0x00007f3aec105f40, __args=0x00007f38ffd91560) at shared_ptr.h:293 6: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr > const&) const [inlined] std::__1::shared_ptr std::__1::allocate_shared, std::__1::basic_string, std::__1::allocator > const&, DB::Block, void>(__args=, __args=) at shared_ptr.h:954 7: DB::SortedBlocksWriter::streamFromFile(std::__1::unique_ptr > const&) const [inlined] std::__1::shared_ptr std::__1::make_shared, std::__1::allocator > const&, DB::Block, void>(__args=, __args=) at shared_ptr.h:963 8: DB::SortedBlocksWriter::streamFromFile(this=, file=) const at SortedBlocksWriter.cpp:238 9: DB::SortedBlocksWriter::premerge(this=) at SortedBlocksWriter.cpp:209:32 ```
[1]: https://s3.amazonaws.com/clickhouse-test-reports/41046/adea92f847373d1fcfd733d8979c63024f9b80bf/stress_test__asan_.html So the problem here is that there was an empty unique_ptr<> reference to a temporary file, because of an empty block accepted by SortedBlocksWriter::insert(). But insert() is not the problem; the problem is premerge(), which steals blocks from insert() and does not check that there are any rows. However, this check exists in SortedBlocksWriter::flush(), and in that case the temporary file is not created. Signed-off-by: Azat Khuzhin --- src/Interpreters/SortedBlocksWriter.cpp | 10 +++++++++- .../0_stateless/02070_join_on_disk.reference | 0 .../0_stateless/02070_join_on_disk.sql | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02070_join_on_disk.reference create mode 100644 tests/queries/0_stateless/02070_join_on_disk.sql diff --git a/src/Interpreters/SortedBlocksWriter.cpp b/src/Interpreters/SortedBlocksWriter.cpp index 20859e23ea7..755c43df635 100644 --- a/src/Interpreters/SortedBlocksWriter.cpp +++ b/src/Interpreters/SortedBlocksWriter.cpp @@ -28,6 +28,11 @@ namespace CurrentMetrics namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + namespace { @@ -84,6 +89,9 @@ void SortedBlocksWriter::insert(Block && block) size_t bytes = 0; size_t flush_no = 0; + if (!block.rows()) + return; + { std::lock_guard lock{insert_mutex}; @@ -145,7 +153,7 @@ SortedBlocksWriter::TmpFilePtr SortedBlocksWriter::flush(const BlocksList & bloc pipes.emplace_back(std::make_shared(block.cloneEmpty(), Chunk(block.getColumns(), num_rows))); if (pipes.empty()) - return {}; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty block"); QueryPipelineBuilder pipeline; pipeline.init(Pipe::unitePipes(std::move(pipes))); diff --git a/tests/queries/0_stateless/02070_join_on_disk.reference b/tests/queries/0_stateless/02070_join_on_disk.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02070_join_on_disk.sql b/tests/queries/0_stateless/02070_join_on_disk.sql new file mode 100644 index 00000000000..c25a7a1ffac --- /dev/null +++ b/tests/queries/0_stateless/02070_join_on_disk.sql @@ -0,0 +1,19 @@ +-- Regression test for the case when Join stores data on disk and receives an empty block. +-- Because of this it does not create an empty file, while one is expected. 
+ +SET max_threads = 1; +SET join_algorithm = 'auto'; +SET max_rows_in_join = 1000; +SET optimize_aggregation_in_order = 1; +SET max_block_size = 1000; + +DROP TABLE IF EXISTS join_on_disk; + +SYSTEM STOP MERGES join_on_disk; + +CREATE TABLE join_on_disk (id Int) Engine=MergeTree() ORDER BY id; + +INSERT INTO join_on_disk SELECT number as id FROM numbers_mt(50000); +INSERT INTO join_on_disk SELECT number as id FROM numbers_mt(1000); + +SELECT id FROM join_on_disk lhs LEFT JOIN (SELECT id FROM join_on_disk GROUP BY id) rhs USING (id) FORMAT Null; From b2b5091ffd04164ffcfee7986b0973c31e2fbc3c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 9 Sep 2022 22:01:52 +0300 Subject: [PATCH 29/87] Update 02070_join_on_disk.sql --- tests/queries/0_stateless/02070_join_on_disk.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02070_join_on_disk.sql b/tests/queries/0_stateless/02070_join_on_disk.sql index c25a7a1ffac..eabf31df25f 100644 --- a/tests/queries/0_stateless/02070_join_on_disk.sql +++ b/tests/queries/0_stateless/02070_join_on_disk.sql @@ -17,3 +17,5 @@ INSERT INTO join_on_disk SELECT number as id FROM numbers_mt(50000); INSERT INTO join_on_disk SELECT number as id FROM numbers_mt(1000); SELECT id FROM join_on_disk lhs LEFT JOIN (SELECT id FROM join_on_disk GROUP BY id) rhs USING (id) FORMAT Null; + +DROP TABLE join_on_disk; From b012caf6859d35b5ff55ac81a34a313edc15b49c Mon Sep 17 00:00:00 2001 From: zhenjial Date: Sat, 10 Sep 2022 13:03:59 +0800 Subject: [PATCH 30/87] empty commit From 27b6a25473ef898b39baed2d6432778070a1d619 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 12 Sep 2022 11:04:30 +0800 Subject: [PATCH 31/87] Compare content instead of count for easier problem solving Signed-off-by: Frank Chen --- ...etry_insert_on_distributed_table.reference | 8 +++---- ...entelemetry_insert_on_distributed_table.sh | 23 +++++++++---------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference index 98fb6a68656..71bbd6f22ae 100644 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference @@ -1,4 +1,4 @@ -1 -1 -1 -1 +{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index 55457d26249..06b35298b89 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ 
b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -22,15 +22,17 @@ function check_span() ${CLICKHOUSE_CLIENT} -nq " SYSTEM FLUSH LOGS; - SELECT count() FROM system.opentelemetry_span_log - WHERE lower(hex(trace_id)) = '${1}' - AND operation_name like '${2}' - AND attribute['clickhouse.shard_num'] = '${3}' - AND attribute['clickhouse.cluster'] = 'test_cluster_two_shards' + SELECT operation_name, + attribute['clickhouse.cluster'] AS cluster, + attribute['clickhouse.shard_num'] AS shard, + attribute['clickhouse.rows'] AS rows, + attribute['clickhouse.bytes'] AS bytes + FROM system.opentelemetry_span_log + WHERE finish_date >= yesterday() + AND lower(hex(trace_id)) = '${1}' AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' - AND attribute['clickhouse.rows'] = '1' - AND attribute['clickhouse.bytes'] = '8' + Format JSONEachRow ;" } @@ -51,17 +53,14 @@ CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=Merg # trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); insert $trace_id 0 -check_span $trace_id '%writeToLocal%' '1' -check_span $trace_id '%processFile%' '2' - +check_span $trace_id # # SYNC INSERT SYNC test with opentelemetry enabled # trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); insert $trace_id 1 -check_span $trace_id '%runWritingJob%' '1' -check_span $trace_id '%runWritingJob%' '2' +check_span $trace_id # # Cleanup From 16975ff4a834d50499db9bf03977bfbb6115a188 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 12 Sep 2022 14:18:21 +0800 Subject: [PATCH 32/87] Fix testcase Signed-off-by: Frank Chen --- .../02417_opentelemetry_insert_on_distributed_table.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference index 71bbd6f22ae..ee5d97c601b 100644 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference @@ -1,4 +1,4 @@ {"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards","shard":"1","rows":"1","bytes":"8"} -{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"void DB::StorageDistributedDirectoryMonitor::processFile(const std::string &)","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} {"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards","shard":"1","rows":"1","bytes":"8"} {"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} From e985ee335463aa5ca8f2c93c4d996ac6a70132db Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 12 Sep 2022 12:22:48 +0000 Subject: [PATCH 33/87] Close sessions on Keeper shutdown --- src/Coordination/CoordinationSettings.h | 1 + 
src/Coordination/KeeperDispatcher.cpp | 51 ++++++++++++++++--- .../{keeper_config.xml => keeper_config1.xml} | 18 +++++-- .../configs/keeper_config2.xml | 37 ++++++++++++++ .../configs/keeper_config3.xml | 37 ++++++++++++++ tests/integration/test_keeper_session/test.py | 28 +++++++++- 6 files changed, 160 insertions(+), 12 deletions(-) rename tests/integration/test_keeper_session/configs/{keeper_config.xml => keeper_config1.xml} (67%) create mode 100644 tests/integration/test_keeper_session/configs/keeper_config2.xml create mode 100644 tests/integration/test_keeper_session/configs/keeper_config3.xml diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 5247f5d7ec8..c436c1b6635 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -30,6 +30,7 @@ struct Settings; M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \ M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \ M(Milliseconds, shutdown_timeout, 5000, "How much time we will wait until RAFT shutdown", 0) \ + M(Milliseconds, session_shutdown_timeout, 10000, "How much time we will wait until sessions are closed during shutdown", 0) \ M(Milliseconds, startup_timeout, 180000, "How much time we will wait until RAFT to start.", 0) \ M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \ diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 5b376a03b02..48030ef86d2 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -354,9 +354,6 @@ void KeeperDispatcher::shutdown() update_configuration_thread.join(); } - if (server) - server->shutdown(); - KeeperStorage::RequestForSession request_for_session; /// Set session expired for all pending requests @@ -368,10 +365,52 @@ void KeeperDispatcher::shutdown() setResponse(request_for_session.session_id, response); } - /// Clear all registered sessions - std::lock_guard lock(session_to_response_callback_mutex); - session_to_response_callback.clear(); + KeeperStorage::RequestsForSessions close_requests; + { + /// Clear all registered sessions + std::lock_guard lock(session_to_response_callback_mutex); + + if (hasLeader()) + { + // send to leader CLOSE requests for active sessions + for (const auto & [session, response] : session_to_response_callback) + { + Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + request->xid = Coordination::CLOSE_XID; + KeeperStorage::RequestForSession request_info; + request_info.request = request; + using namespace std::chrono; + request_info.time = duration_cast(system_clock::now().time_since_epoch()).count(); + request_info.session_id = session; + + close_requests.push_back(std::move(request_info)); + } + } + + session_to_response_callback.clear(); + } + + // if there is no leader, there is no reason to do CLOSE because it's a write request + if (hasLeader() && !close_requests.empty()) + { + LOG_INFO(log, "Trying to close {} session(s)", close_requests.size()); + const auto raft_result = server->putRequestBatch(close_requests); + Poco::Event sessions_closing_done; + 
raft_result->when_ready([&](nuraft::cmd_result> & /*result*/, nuraft::ptr & /*exception*/) + { + sessions_closing_done.set(); + }); + + auto session_shutdown_timeout = configuration_and_settings->coordination_settings->session_shutdown_timeout.totalMilliseconds(); + if (!sessions_closing_done.tryWait(session_shutdown_timeout)) + LOG_WARNING(log, "Failed to close sessions in {}ms. If they are not closed, they will be closed after session timeout.", session_shutdown_timeout); + } + + if (server) + server->shutdown(); + CurrentMetrics::set(CurrentMetrics::KeeperAliveConnections, 0); + } catch (...) { diff --git a/tests/integration/test_keeper_session/configs/keeper_config.xml b/tests/integration/test_keeper_session/configs/keeper_config1.xml similarity index 67% rename from tests/integration/test_keeper_session/configs/keeper_config.xml rename to tests/integration/test_keeper_session/configs/keeper_config1.xml index ed0bb52bd51..fd308fe8a2f 100644 --- a/tests/integration/test_keeper_session/configs/keeper_config.xml +++ b/tests/integration/test_keeper_session/configs/keeper_config1.xml @@ -1,4 +1,4 @@ - + 9181 1 @@ -19,9 +19,19 @@ 1 node1 9234 - true - 3 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true - + diff --git a/tests/integration/test_keeper_session/configs/keeper_config2.xml b/tests/integration/test_keeper_session/configs/keeper_config2.xml new file mode 100644 index 00000000000..ad558fbccad --- /dev/null +++ b/tests/integration/test_keeper_session/configs/keeper_config2.xml @@ -0,0 +1,37 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + * + + + 5000 + 10000 + 5000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_session/configs/keeper_config3.xml b/tests/integration/test_keeper_session/configs/keeper_config3.xml new file mode 100644 index 00000000000..2a21f959816 --- /dev/null +++ b/tests/integration/test_keeper_session/configs/keeper_config3.xml @@ -0,0 +1,37 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + * + + + 5000 + 10000 + 5000 + 75 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + true + + + 3 + node3 + 9234 + true + + + + diff --git a/tests/integration/test_keeper_session/test.py b/tests/integration/test_keeper_session/test.py index 30db4d9548c..bb72a30359d 100644 --- a/tests/integration/test_keeper_session/test.py +++ b/tests/integration/test_keeper_session/test.py @@ -10,7 +10,15 @@ from kazoo.client import KazooClient cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance( - "node1", main_configs=["configs/keeper_config.xml"], stay_alive=True + "node1", main_configs=["configs/keeper_config1.xml"], stay_alive=True +) + +node2 = cluster.add_instance( + "node2", main_configs=["configs/keeper_config2.xml"], stay_alive=True +) + +node3 = cluster.add_instance( + "node3", main_configs=["configs/keeper_config3.xml"], stay_alive=True ) bool_struct = struct.Struct("B") @@ -61,7 +69,7 @@ def wait_node(node): def wait_nodes(): - for n in [node1]: + for n in [node1, node2, node3]: wait_node(n) @@ -165,3 +173,19 @@ def test_session_timeout(started_cluster): negotiated_timeout, _ = handshake(node1.name, session_timeout=20000, session_id=0) assert negotiated_timeout == 10000 + + +def test_session_close_shutdown(started_cluster): + wait_nodes() + + node1_zk = get_fake_zk(node1.name) + node2_zk = get_fake_zk(node2.name) + + eph_node = "/test_node" + 
node2_zk.create(eph_node, ephemeral=True) + assert node1_zk.exists(eph_node) != None + + # shutdown while session is active + node2.stop_clickhouse() + + assert node1_zk.exists(eph_node) == None From c2dfabe51c7ecbfb32aac7cfe07e577acf04c3dd Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 12 Sep 2022 12:25:39 +0000 Subject: [PATCH 34/87] Use shared_ptr for Event --- src/Coordination/KeeperDispatcher.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 48030ef86d2..9684f085f4a 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -395,14 +395,14 @@ void KeeperDispatcher::shutdown() { LOG_INFO(log, "Trying to close {} session(s)", close_requests.size()); const auto raft_result = server->putRequestBatch(close_requests); - Poco::Event sessions_closing_done; - raft_result->when_ready([&](nuraft::cmd_result> & /*result*/, nuraft::ptr & /*exception*/) + auto sessions_closing_done = std::make_shared(); + raft_result->when_ready([sessions_closing_done](nuraft::cmd_result> & /*result*/, nuraft::ptr & /*exception*/) { - sessions_closing_done.set(); + sessions_closing_done->set(); }); auto session_shutdown_timeout = configuration_and_settings->coordination_settings->session_shutdown_timeout.totalMilliseconds(); - if (!sessions_closing_done.tryWait(session_shutdown_timeout)) + if (!sessions_closing_done->tryWait(session_shutdown_timeout)) LOG_WARNING(log, "Failed to close sessions in {}ms. If they are not closed, they will be closed after session timeout.", session_shutdown_timeout); } From ebaa24ecaeedc6ba64fe4334f5a9e10583ecbd8c Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 12 Sep 2022 22:15:30 +0800 Subject: [PATCH 35/87] Fix flaky tests Signed-off-by: Frank Chen --- src/Common/ThreadPool.h | 18 +++++++--- src/Core/BackgroundSchedulePool.cpp | 6 ++-- src/Core/BackgroundSchedulePool.h | 6 ++-- ...etry_insert_on_distributed_table.reference | 12 ++++--- ...entelemetry_insert_on_distributed_table.sh | 34 +++++++++++++++---- 5 files changed, 56 insertions(+), 20 deletions(-) diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index fc5377b3783..45f3455bf8f 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -264,6 +264,18 @@ protected: } }; +/// Schedule jobs/tasks on global thread pool without implicit passing tracing context on current thread to underlying worker as parent tracing context. +/// +/// If you implement your own job/task scheduling upon global thread pool or schedules a long time running job in a infinite loop way, +/// you need to use class, or you need to use ThreadFromGlobalPool below. +/// +/// See the comments of ThreadPool below to know how it works. +using ThreadFromGlobalPoolWithoutTracingContext = ThreadFromGlobalPoolImpl; + +/// An alias of thread that execute jobs/tasks on global thread pool by implicit passing tracing context on current thread to underlying worker as parent tracing context. +/// If jobs/tasks are directly scheduled by using APIs of this class, you need to use this class or you need to use class above. +using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl; + /// Recommended thread pool for the case when multiple thread pools are created and destroyed. /// /// The template parameter of ThreadFromGlobalPool is set to false to disable tracing context propagation to underlying worker. 
@@ -274,9 +286,7 @@ protected: /// which means the tracing context initialized at underlying worker level won't be delete for a very long time. /// This would cause wrong context for further jobs scheduled in ThreadPool. /// -/// To make sure the tracing context are correctly propagated, we explicitly disable context propagation(including initialization and de-initialization) at underlying worker level. +/// To make sure the tracing context is correctly propagated, we explicitly disable context propagation(including initialization and de-initialization) at underlying worker level. /// -using ThreadPool = ThreadPoolImpl>; +using ThreadPool = ThreadPoolImpl; -/// An alias for user code to execute a job in the global thread pool -using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl; diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index b7a33c4930d..b2adc07d92f 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -149,9 +149,9 @@ BackgroundSchedulePool::BackgroundSchedulePool(size_t size_, CurrentMetrics::Met threads.resize(size_); for (auto & thread : threads) - thread = ThreadFromGlobalPool([this] { threadFunction(); }); + thread = ThreadFromGlobalPoolWithoutTracingContext([this] { threadFunction(); }); - delayed_thread = ThreadFromGlobalPool([this] { delayExecutionThreadFunction(); }); + delayed_thread = ThreadFromGlobalPoolWithoutTracingContext([this] { delayExecutionThreadFunction(); }); } @@ -168,7 +168,7 @@ void BackgroundSchedulePool::increaseThreadsCount(size_t new_threads_count) threads.resize(new_threads_count); for (size_t i = old_threads_count; i < new_threads_count; ++i) - threads[i] = ThreadFromGlobalPool([this] { threadFunction(); }); + threads[i] = ThreadFromGlobalPoolWithoutTracingContext([this] { threadFunction(); }); } diff --git a/src/Core/BackgroundSchedulePool.h b/src/Core/BackgroundSchedulePool.h index 36cbad145c9..e7abc99a4a8 100644 --- a/src/Core/BackgroundSchedulePool.h +++ b/src/Core/BackgroundSchedulePool.h @@ -57,7 +57,9 @@ public: ~BackgroundSchedulePool(); private: - using Threads = std::vector; + /// BackgroundSchedulePool schedules a task on its own task queue, there's no need to construct/restore tracing context on this level. + /// This is also how ThreadPool class treats the tracing context. See ThreadPool for more information. + using Threads = std::vector; void threadFunction(); void delayExecutionThreadFunction(); @@ -83,7 +85,7 @@ private: std::condition_variable delayed_tasks_cond_var; std::mutex delayed_tasks_mutex; /// Thread waiting for next delayed task. - ThreadFromGlobalPool delayed_thread; + ThreadFromGlobalPoolImpl delayed_thread; /// Tasks ordered by scheduled time. 
DelayedTasks delayed_tasks; diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference index ee5d97c601b..dde07d4540d 100644 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.reference @@ -1,4 +1,8 @@ -{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards","shard":"1","rows":"1","bytes":"8"} -{"operation_name":"void DB::StorageDistributedDirectoryMonitor::processFile(const std::string &)","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} -{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards","shard":"1","rows":"1","bytes":"8"} -{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"void DB::DistributedSink::writeToLocal(const Cluster::ShardInfo &, const DB::Block &, size_t)","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"void DB::StorageDistributedDirectoryMonitor::processFile(const std::string &)","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"void DB::StorageDistributedDirectoryMonitor::processFile(const std::string &)","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"1","rows":"1","bytes":"8"} +{"operation_name":"auto DB::DistributedSink::runWritingJob(DB::DistributedSink::JobReplica &, const DB::Block &, size_t)::(anonymous class)::operator()() const","cluster":"test_cluster_two_shards_localhost","shard":"2","rows":"1","bytes":"8"} diff --git a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh index 06b35298b89..9ac5f061d4a 100755 --- a/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh +++ b/tests/queries/0_stateless/02417_opentelemetry_insert_on_distributed_table.sh @@ -6,13 +6,18 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
"$CURDIR"/../shell_config.sh +# This function takes 4 arguments: +# $1 - OpenTelemetry Trace Id +# $2 - value of insert_distributed_sync +# $3 - value of prefer_localhost_replica +# $4 - a String that helps to debug function insert() { - echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=$2 VALUES(1),(2)" | + echo "INSERT INTO ${CLICKHOUSE_DATABASE}.dist_opentelemetry SETTINGS insert_distributed_sync=$2, prefer_localhost_replica=$3 VALUES(1),(2)" | ${CLICKHOUSE_CURL} \ -X POST \ -H "traceparent: 00-$1-5150000000000515-01" \ - -H "tracestate: some custom state" \ + -H "tracestate: $4" \ "${CLICKHOUSE_URL}" \ --data @- } @@ -32,6 +37,7 @@ ${CLICKHOUSE_CLIENT} -nq " AND lower(hex(trace_id)) = '${1}' AND attribute['clickhouse.distributed'] = '${CLICKHOUSE_DATABASE}.dist_opentelemetry' AND attribute['clickhouse.remote'] = '${CLICKHOUSE_DATABASE}.local_opentelemetry' + ORDER BY attribute['clickhouse.shard_num'] Format JSONEachRow ;" } @@ -44,22 +50,36 @@ ${CLICKHOUSE_CLIENT} -nq " DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.dist_opentelemetry; DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.local_opentelemetry; -CREATE TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry (key UInt64) Engine=Distributed('test_cluster_two_shards', ${CLICKHOUSE_DATABASE}, local_opentelemetry, key % 2); +CREATE TABLE ${CLICKHOUSE_DATABASE}.dist_opentelemetry (key UInt64) Engine=Distributed('test_cluster_two_shards_localhost', ${CLICKHOUSE_DATABASE}, local_opentelemetry, key % 2); CREATE TABLE ${CLICKHOUSE_DATABASE}.local_opentelemetry (key UInt64) Engine=MergeTree ORDER BY key; " # -# ASYNC INSERT test with opentelemetry enabled +# test1 # trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); -insert $trace_id 0 +insert $trace_id 0 1 "async-insert-writeToLocal" check_span $trace_id # -# SYNC INSERT SYNC test with opentelemetry enabled +# test2 # trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); -insert $trace_id 1 +insert $trace_id 0 0 "async-insert-writeToRemote" +check_span $trace_id + +# +# test3 +# +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +insert $trace_id 1 1 "sync-insert-writeToLocal" +check_span $trace_id + +# +# test4 +# +trace_id=$(${CLICKHOUSE_CLIENT} -q "select lower(hex(generateUUIDv4()))"); +insert $trace_id 1 0 "sync-insert-writeToRemote" check_span $trace_id # From 7d6903bdc0d2b3d43b13bfadb1e048ba403668ad Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 00:06:03 +0800 Subject: [PATCH 36/87] Fix Signed-off-by: Frank Chen --- src/Core/BackgroundSchedulePool.cpp | 6 +++--- src/Core/BackgroundSchedulePool.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp index b2adc07d92f..29cd3c1c540 100644 --- a/src/Core/BackgroundSchedulePool.cpp +++ b/src/Core/BackgroundSchedulePool.cpp @@ -149,9 +149,9 @@ BackgroundSchedulePool::BackgroundSchedulePool(size_t size_, CurrentMetrics::Met threads.resize(size_); for (auto & thread : threads) - thread = ThreadFromGlobalPoolWithoutTracingContext([this] { threadFunction(); }); + thread = ThreadFromGlobalPoolNoTracingContextPropagation([this] { threadFunction(); }); - delayed_thread = ThreadFromGlobalPoolWithoutTracingContext([this] { delayExecutionThreadFunction(); }); + delayed_thread = ThreadFromGlobalPoolNoTracingContextPropagation([this] { delayExecutionThreadFunction(); }); } @@ -168,7 +168,7 @@ void 
BackgroundSchedulePool::increaseThreadsCount(size_t new_threads_count) threads.resize(new_threads_count); for (size_t i = old_threads_count; i < new_threads_count; ++i) - threads[i] = ThreadFromGlobalPoolWithoutTracingContext([this] { threadFunction(); }); + threads[i] = ThreadFromGlobalPoolNoTracingContextPropagation([this] { threadFunction(); }); } diff --git a/src/Core/BackgroundSchedulePool.h b/src/Core/BackgroundSchedulePool.h index e7abc99a4a8..1001d98e643 100644 --- a/src/Core/BackgroundSchedulePool.h +++ b/src/Core/BackgroundSchedulePool.h @@ -59,7 +59,7 @@ public: private: /// BackgroundSchedulePool schedules a task on its own task queue, there's no need to construct/restore tracing context on this level. /// This is also how ThreadPool class treats the tracing context. See ThreadPool for more information. - using Threads = std::vector; + using Threads = std::vector; void threadFunction(); void delayExecutionThreadFunction(); @@ -85,7 +85,7 @@ private: std::condition_variable delayed_tasks_cond_var; std::mutex delayed_tasks_mutex; /// Thread waiting for next delayed task. - ThreadFromGlobalPoolImpl delayed_thread; + ThreadFromGlobalPoolNoTracingContextPropagation delayed_thread; /// Tasks ordered by scheduled time. DelayedTasks delayed_tasks; From 7e1f2901daa68da3765aebfc6efa937f4ad9d03c Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 00:06:17 +0800 Subject: [PATCH 37/87] Fix Signed-off-by: Frank Chen --- src/Common/ThreadPool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index 45f3455bf8f..af221449be9 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -270,7 +270,7 @@ protected: /// you need to use class, or you need to use ThreadFromGlobalPool below. /// /// See the comments of ThreadPool below to know how it works. -using ThreadFromGlobalPoolWithoutTracingContext = ThreadFromGlobalPoolImpl; +using ThreadFromGlobalPoolNoTracingContextPropagation = ThreadFromGlobalPoolImpl; /// An alias of thread that execute jobs/tasks on global thread pool by implicit passing tracing context on current thread to underlying worker as parent tracing context. /// If jobs/tasks are directly scheduled by using APIs of this class, you need to use this class or you need to use class above. @@ -288,5 +288,5 @@ using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl; /// /// To make sure the tracing context is correctly propagated, we explicitly disable context propagation(including initialization and de-initialization) at underlying worker level. /// -using ThreadPool = ThreadPoolImpl; +using ThreadPool = ThreadPoolImpl; From 20191932dfe6a4d1dcb3def80b24f9fe19a99167 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 00:41:05 +0800 Subject: [PATCH 38/87] Fix style Signed-off-by: Frank Chen --- src/Common/ThreadPool.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index af221449be9..76ada9e0d75 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -266,7 +266,7 @@ protected: /// Schedule jobs/tasks on global thread pool without implicit passing tracing context on current thread to underlying worker as parent tracing context. 
/// -/// If you implement your own job/task scheduling upon global thread pool or schedules a long time running job in a infinite loop way, +/// If you implement your own job/task scheduling upon global thread pool or schedules a long time running job in a infinite loop way, /// you need to use class, or you need to use ThreadFromGlobalPool below. /// /// See the comments of ThreadPool below to know how it works. @@ -289,4 +289,3 @@ using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl; /// To make sure the tracing context is correctly propagated, we explicitly disable context propagation(including initialization and de-initialization) at underlying worker level. /// using ThreadPool = ThreadPoolImpl; - From a3826c4b3ff1b07a7adba67c295bd7270654ad74 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Mon, 12 Sep 2022 13:27:52 -0400 Subject: [PATCH 39/87] move title to frontmatter --- docs/en/operations/troubleshooting.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md index 5a61359a2c0..93bd56087a2 100644 --- a/docs/en/operations/troubleshooting.md +++ b/docs/en/operations/troubleshooting.md @@ -2,10 +2,9 @@ slug: /en/operations/troubleshooting sidebar_position: 46 sidebar_label: Troubleshooting +title: Troubleshooting --- -# Troubleshooting - - [Installation](#troubleshooting-installation-errors) - [Connecting to the server](#troubleshooting-accepts-no-connections) - [Query processing](#troubleshooting-does-not-process-queries) From b2cc6a8cc6d14cf743c53c29485ea220055529cd Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 12 Sep 2022 18:19:41 +0000 Subject: [PATCH 40/87] Use promise/future --- src/Coordination/KeeperDispatcher.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 9684f085f4a..8084bf1d513 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -372,6 +372,7 @@ void KeeperDispatcher::shutdown() if (hasLeader()) { + close_requests.reserve(session_to_response_callback.size()); // send to leader CLOSE requests for active sessions for (const auto & [session, response] : session_to_response_callback) { @@ -395,15 +396,18 @@ void KeeperDispatcher::shutdown() { LOG_INFO(log, "Trying to close {} session(s)", close_requests.size()); const auto raft_result = server->putRequestBatch(close_requests); - auto sessions_closing_done = std::make_shared(); - raft_result->when_ready([sessions_closing_done](nuraft::cmd_result> & /*result*/, nuraft::ptr & /*exception*/) - { - sessions_closing_done->set(); - }); + auto sessions_closing_done_promise = std::make_shared>(); + auto sessions_closing_done = sessions_closing_done_promise->get_future(); + raft_result->when_ready([sessions_closing_done_promise = std::move(sessions_closing_done_promise)]( + nuraft::cmd_result> & /*result*/, + nuraft::ptr & /*exception*/) { sessions_closing_done_promise->set_value(); }); auto session_shutdown_timeout = configuration_and_settings->coordination_settings->session_shutdown_timeout.totalMilliseconds(); - if (!sessions_closing_done->tryWait(session_shutdown_timeout)) - LOG_WARNING(log, "Failed to close sessions in {}ms. 
If they are not closed, they will be closed after session timeout.", session_shutdown_timeout); + if (sessions_closing_done.wait_for(std::chrono::milliseconds(session_shutdown_timeout)) != std::future_status::ready) + LOG_WARNING( + log, + "Failed to close sessions in {}ms. If they are not closed, they will be closed after session timeout.", + session_shutdown_timeout); } if (server)
From 569b4bb63102472328f0ea02bb16cd0532046465 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Mon, 12 Sep 2022 16:16:40 +0200
Subject: [PATCH 41/87] Add ability to automatically comment SQL queries in clickhouse-client/local
This is like Alt-# in readline: it is useful when you need to look at something else and want to save the current query/command somewhere, and a commented line in the history is a good option.
Signed-off-by: Azat Khuzhin
---
 base/base/ReplxxLineReader.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+)
diff --git a/base/base/ReplxxLineReader.cpp b/base/base/ReplxxLineReader.cpp index b7c18110503..75c48f690f8 100644 --- a/base/base/ReplxxLineReader.cpp +++ b/base/base/ReplxxLineReader.cpp
@@ -220,6 +220,35 @@ ReplxxLineReader::ReplxxLineReader( rx.bind_key(Replxx::KEY::control('W'), [this](char32_t code) { return rx.invoke(Replxx::ACTION::KILL_TO_WHITESPACE_ON_LEFT, code); }); rx.bind_key(Replxx::KEY::meta('E'), [this](char32_t) { openEditor(); return Replxx::ACTION_RESULT::CONTINUE; }); + + /// readline insert-comment + auto insert_comment_action = [this](char32_t code) + { + replxx::Replxx::State state(rx.get_state()); + const char * line = state.text(); + const char * line_end = line + strlen(line); + + std::string commented_line; + if (std::find(line, line_end, '\n') != line_end) + { + /// If query has multiple lines, multiline comment is used over + /// commenting each line separately for easier uncomment (though + /// with invoking editor it is simpler to uncomment multiple lines) + /// + /// Note, that using multiline comment is OK even with nested + /// comments, since nested comments are supported. + commented_line = fmt::format("/* {} */", state.text()); + } + else + { + // In a simplest case use simple comment.
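+                // For example, a one-line buffer such as `SELECT 1` ends up in the
+                // history as `-- SELECT 1`, while multi-line input is wrapped into a
+                // `/* ... */` block by the branch above.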
+ commented_line = fmt::format("-- {}", state.text()); + } + rx.set_state(replxx::Replxx::State(commented_line.c_str(), commented_line.size())); + + return rx.invoke(Replxx::ACTION::COMMIT_LINE, code); + }; + rx.bind_key(Replxx::KEY::meta('#'), insert_comment_action); } ReplxxLineReader::~ReplxxLineReader() From 456f8a0e629f0747dbaf9a706abb5d2586db34e4 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Tue, 13 Sep 2022 10:25:52 +0800 Subject: [PATCH 42/87] fix test permission --- .../0_stateless/02421_record_errors_row_by_input_format.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh old mode 100644 new mode 100755 From 16c8cd0bd383a321f41f0f0af6547b4e58a06521 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Tue, 13 Sep 2022 14:19:40 +0800 Subject: [PATCH 43/87] wait write finish --- src/Processors/Formats/IRowInputFormat.cpp | 4 +--- .../0_stateless/02421_record_errors_row_by_input_format.sh | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Processors/Formats/IRowInputFormat.cpp b/src/Processors/Formats/IRowInputFormat.cpp index b3882b43570..52395338279 100644 --- a/src/Processors/Formats/IRowInputFormat.cpp +++ b/src/Processors/Formats/IRowInputFormat.cpp @@ -1,9 +1,7 @@ #include #include +#include // toString #include -#include -#include -#include #include diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh index dd782c6ed40..835769e19f1 100755 --- a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh @@ -19,10 +19,12 @@ ${CLICKHOUSE_CLIENT} --query "create table data (A UInt8, B UInt8) engine=MergeT # Server side ${CLICKHOUSE_CLIENT} --input_format_allow_errors_num 4 --input_format_record_errors_file_path "errors_server" --query "insert into data select * from file('a.csv', 'CSV', 'c1 UInt8, c2 UInt8');" +sleep 2 ${CLICKHOUSE_CLIENT} --query "select * except (time) from file('errors_server', 'CSV', 'time DateTime, database Nullable(String), table Nullable(String), offset UInt32, reason String, raw_data String');" # Client side ${CLICKHOUSE_CLIENT} --input_format_allow_errors_num 4 --input_format_record_errors_file_path "${CLICKHOUSE_USER_FILES_PATH}/errors_client" --query "insert into data(A, B) format CSV" < ${CLICKHOUSE_USER_FILES_PATH}/a.csv +sleep 2 ${CLICKHOUSE_CLIENT} --query "select * except (time) from file('errors_client', 'CSV', 'time DateTime, database Nullable(String), table Nullable(String), offset UInt32, reason String, raw_data String');" # Restore From 67c08e3e22be6de5454a22ef7bbaadac7d0be5bb Mon Sep 17 00:00:00 2001 From: zhenjial Date: Tue, 13 Sep 2022 15:06:22 +0800 Subject: [PATCH 44/87] sync before destruct --- src/Processors/Formats/InputFormatErrorsLogger.cpp | 5 +++++ src/Processors/Formats/InputFormatErrorsLogger.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/InputFormatErrorsLogger.cpp b/src/Processors/Formats/InputFormatErrorsLogger.cpp index 24f526a9b50..8a60c8a07ee 100644 --- a/src/Processors/Formats/InputFormatErrorsLogger.cpp +++ b/src/Processors/Formats/InputFormatErrorsLogger.cpp @@ -45,6 +45,11 @@ InputFormatErrorsLogger::InputFormatErrorsLogger(const 
ContextPtr & context) writer = context->getOutputFormat(output_format, *write_buf, header); } +InputFormatErrorsLogger::~InputFormatErrorsLogger() +{ + write_buf->sync(); +} + void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) { auto error = header.cloneEmpty(); diff --git a/src/Processors/Formats/InputFormatErrorsLogger.h b/src/Processors/Formats/InputFormatErrorsLogger.h index 20cfb5da133..4b3766f4d37 100644 --- a/src/Processors/Formats/InputFormatErrorsLogger.h +++ b/src/Processors/Formats/InputFormatErrorsLogger.h @@ -20,7 +20,7 @@ public: InputFormatErrorsLogger(const ContextPtr & context); - virtual ~InputFormatErrorsLogger() = default; + virtual ~InputFormatErrorsLogger(); virtual void logError(ErrorEntry entry); void logErrorImpl(ErrorEntry entry); From 5841d9e9b0804dd76e9899857c5e2dee38fc3173 Mon Sep 17 00:00:00 2001 From: zhenjial Date: Tue, 13 Sep 2022 15:53:24 +0800 Subject: [PATCH 45/87] sync before destruct --- src/Processors/Formats/InputFormatErrorsLogger.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Processors/Formats/InputFormatErrorsLogger.cpp b/src/Processors/Formats/InputFormatErrorsLogger.cpp index 8a60c8a07ee..e6f8cdd43ee 100644 --- a/src/Processors/Formats/InputFormatErrorsLogger.cpp +++ b/src/Processors/Formats/InputFormatErrorsLogger.cpp @@ -47,7 +47,9 @@ InputFormatErrorsLogger::InputFormatErrorsLogger(const ContextPtr & context) InputFormatErrorsLogger::~InputFormatErrorsLogger() { - write_buf->sync(); + writer->finalize(); + writer->flush(); + write_buf->finalize(); } void InputFormatErrorsLogger::logErrorImpl(ErrorEntry entry) From 45cde902190310ad4f189850aa265aabfea48acd Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 13 Sep 2022 09:51:31 +0000 Subject: [PATCH 46/87] Fix test --- tests/integration/test_keeper_session/test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/test_keeper_session/test.py b/tests/integration/test_keeper_session/test.py index bb72a30359d..4b3aa7e3fdf 100644 --- a/tests/integration/test_keeper_session/test.py +++ b/tests/integration/test_keeper_session/test.py @@ -189,3 +189,5 @@ def test_session_close_shutdown(started_cluster): node2.stop_clickhouse() assert node1_zk.exists(eph_node) == None + + node2.start_clickhouse() From a6b5ffec5d9f9af25f339f9f73d5e382e1bde62d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 13 Sep 2022 09:51:46 +0000 Subject: [PATCH 47/87] Polishing --- src/Coordination/KeeperDispatcher.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index 8084bf1d513..261e43d80e4 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ b/src/Coordination/KeeperDispatcher.cpp @@ -376,13 +376,15 @@ void KeeperDispatcher::shutdown() // send to leader CLOSE requests for active sessions for (const auto & [session, response] : session_to_response_callback) { - Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + auto request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); request->xid = Coordination::CLOSE_XID; - KeeperStorage::RequestForSession request_info; - request_info.request = request; using namespace std::chrono; - request_info.time = duration_cast(system_clock::now().time_since_epoch()).count(); - request_info.session_id = session; + KeeperStorage::RequestForSession request_info + { + .session_id = session, + 
.time = duration_cast(system_clock::now().time_since_epoch()).count(), + .request = std::move(request), + }; close_requests.push_back(std::move(request_info)); } @@ -461,13 +463,15 @@ void KeeperDispatcher::sessionCleanerTask() LOG_INFO(log, "Found dead session {}, will try to close it", dead_session); /// Close session == send close request to raft server - Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + auto request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); request->xid = Coordination::CLOSE_XID; - KeeperStorage::RequestForSession request_info; - request_info.request = request; using namespace std::chrono; - request_info.time = duration_cast(system_clock::now().time_since_epoch()).count(); - request_info.session_id = dead_session; + KeeperStorage::RequestForSession request_info + { + .session_id = dead_session, + .time = duration_cast(system_clock::now().time_since_epoch()).count(), + .request = std::move(request), + }; { std::lock_guard lock(push_request_mutex); if (!requests_queue->push(std::move(request_info))) From 7303ae17966014a021373eebcac1b798af8e6c25 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 18:27:29 +0800 Subject: [PATCH 48/87] Make sure span holder will be destructed only once --- src/Common/OpenTelemetryTraceContext.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp index 7a1f94926d5..d5c2188ad01 100644 --- a/src/Common/OpenTelemetryTraceContext.cpp +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -130,16 +130,15 @@ void SpanHolder::finish() noexcept try { auto log = current_thread_trace_context.span_log.lock(); - if (!log) + + /// The log might be disabled, check it before use + if (log) { - // The log might be disabled. - return; + this->finish_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + log->add(OpenTelemetrySpanLogElement(*this)); } - - this->finish_time_us - = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); - - log->add(OpenTelemetrySpanLogElement(*this)); } catch (...) { From d3265150c03f4769d316b90c588b6764b4873430 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 18:28:21 +0800 Subject: [PATCH 49/87] Make sure span is finished in the onFinish callback Signed-off-by: Frank Chen --- src/Interpreters/executeQuery.cpp | 202 +++++++++++++++--------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index c501c1722ba..b6434955418 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -838,101 +838,117 @@ static std::tuple executeQueryImpl( { QueryStatus * process_list_elem = context->getProcessListElement(); - if (!process_list_elem) - return; - - /// Update performance counters before logging to query_log - CurrentThread::finalizePerformanceCounters(); - - QueryStatusInfo info = process_list_elem->getInfo(true, context->getSettingsRef().log_profile_events); - - double elapsed_seconds = info.elapsed_seconds; - - elem.type = QueryLogElementType::QUERY_FINISH; - - // construct event_time and event_time_microseconds using the same time point - // so that the two times will always be equal up to a precision of a second. 
- const auto finish_time = std::chrono::system_clock::now(); - elem.event_time = time_in_seconds(finish_time); - elem.event_time_microseconds = time_in_microseconds(finish_time); - status_info_to_query_log(elem, info, ast, context); - - if (pulling_pipeline) + if (process_list_elem) { - query_pipeline.tryGetResultRowsAndBytes(elem.result_rows, elem.result_bytes); - } - else /// will be used only for ordinary INSERT queries - { - auto progress_out = process_list_elem->getProgressOut(); - elem.result_rows = progress_out.written_rows; - elem.result_bytes = progress_out.written_bytes; - } + /// Update performance counters before logging to query_log + CurrentThread::finalizePerformanceCounters(); - auto progress_callback = context->getProgressCallback(); - if (progress_callback) - { - Progress p(WriteProgress{info.written_rows, info.written_bytes}); - p.incrementPiecewiseAtomically(Progress{ResultProgress{elem.result_rows, elem.result_bytes}}); - progress_callback(p); - } + QueryStatusInfo info = process_list_elem->getInfo(true, context->getSettingsRef().log_profile_events); - if (elem.read_rows != 0) - { - LOG_INFO(&Poco::Logger::get("executeQuery"), "Read {} rows, {} in {} sec., {} rows/sec., {}/sec.", - elem.read_rows, ReadableSize(elem.read_bytes), elapsed_seconds, - static_cast(elem.read_rows / elapsed_seconds), - ReadableSize(elem.read_bytes / elapsed_seconds)); - } + double elapsed_seconds = info.elapsed_seconds; - if (log_queries && elem.type >= log_queries_min_type && static_cast(elem.query_duration_ms) >= log_queries_min_query_duration_ms) - { - if (auto query_log = context->getQueryLog()) - query_log->add(elem); - } - if (log_processors_profiles) - { - if (auto processors_profile_log = context->getProcessorsProfileLog()) + elem.type = QueryLogElementType::QUERY_FINISH; + + // construct event_time and event_time_microseconds using the same time point + // so that the two times will always be equal up to a precision of a second. 
+ const auto finish_time = std::chrono::system_clock::now(); + elem.event_time = time_in_seconds(finish_time); + elem.event_time_microseconds = time_in_microseconds(finish_time); + status_info_to_query_log(elem, info, ast, context); + + if (pulling_pipeline) { - ProcessorProfileLogElement processor_elem; - processor_elem.event_time = time_in_seconds(finish_time); - processor_elem.event_time_microseconds = time_in_microseconds(finish_time); - processor_elem.query_id = elem.client_info.current_query_id; + query_pipeline.tryGetResultRowsAndBytes(elem.result_rows, elem.result_bytes); + } + else /// will be used only for ordinary INSERT queries + { + auto progress_out = process_list_elem->getProgressOut(); + elem.result_rows = progress_out.written_rows; + elem.result_bytes = progress_out.written_bytes; + } - auto get_proc_id = [](const IProcessor & proc) -> UInt64 - { - return reinterpret_cast(&proc); - }; + auto progress_callback = context->getProgressCallback(); + if (progress_callback) + { + Progress p(WriteProgress{info.written_rows, info.written_bytes}); + p.incrementPiecewiseAtomically(Progress{ResultProgress{elem.result_rows, elem.result_bytes}}); + progress_callback(p); + } - for (const auto & processor : query_pipeline.getProcessors()) + if (elem.read_rows != 0) + { + LOG_INFO(&Poco::Logger::get("executeQuery"), "Read {} rows, {} in {} sec., {} rows/sec., {}/sec.", + elem.read_rows, ReadableSize(elem.read_bytes), elapsed_seconds, + static_cast(elem.read_rows / elapsed_seconds), + ReadableSize(elem.read_bytes / elapsed_seconds)); + } + + if (log_queries && elem.type >= log_queries_min_type && static_cast(elem.query_duration_ms) >= log_queries_min_query_duration_ms) + { + if (auto query_log = context->getQueryLog()) + query_log->add(elem); + } + if (log_processors_profiles) + { + if (auto processors_profile_log = context->getProcessorsProfileLog()) { - std::vector parents; - for (const auto & port : processor->getOutputs()) + ProcessorProfileLogElement processor_elem; + processor_elem.event_time = time_in_seconds(finish_time); + processor_elem.event_time_microseconds = time_in_microseconds(finish_time); + processor_elem.query_id = elem.client_info.current_query_id; + + auto get_proc_id = [](const IProcessor & proc) -> UInt64 { - if (!port.isConnected()) - continue; - const IProcessor & next = port.getInputPort().getProcessor(); - parents.push_back(get_proc_id(next)); + return reinterpret_cast(&proc); + }; + + for (const auto & processor : query_pipeline.getProcessors()) + { + std::vector parents; + for (const auto & port : processor->getOutputs()) + { + if (!port.isConnected()) + continue; + const IProcessor & next = port.getInputPort().getProcessor(); + parents.push_back(get_proc_id(next)); + } + + processor_elem.id = get_proc_id(*processor); + processor_elem.parent_ids = std::move(parents); + + processor_elem.plan_step = reinterpret_cast(processor->getQueryPlanStep()); + processor_elem.plan_group = processor->getQueryPlanStepGroup(); + + processor_elem.processor_name = processor->getName(); + + processor_elem.elapsed_us = processor->getElapsedUs(); + processor_elem.input_wait_elapsed_us = processor->getInputWaitElapsedUs(); + processor_elem.output_wait_elapsed_us = processor->getOutputWaitElapsedUs(); + + auto stats = processor->getProcessorDataStats(); + processor_elem.input_rows = stats.input_rows; + processor_elem.input_bytes = stats.input_bytes; + processor_elem.output_rows = stats.output_rows; + processor_elem.output_bytes = stats.output_bytes; + + 
processors_profile_log->add(processor_elem); } + } + } - processor_elem.id = get_proc_id(*processor); - processor_elem.parent_ids = std::move(parents); - - processor_elem.plan_step = reinterpret_cast(processor->getQueryPlanStep()); - processor_elem.plan_group = processor->getQueryPlanStepGroup(); - - processor_elem.processor_name = processor->getName(); - - processor_elem.elapsed_us = processor->getElapsedUs(); - processor_elem.input_wait_elapsed_us = processor->getInputWaitElapsedUs(); - processor_elem.output_wait_elapsed_us = processor->getOutputWaitElapsedUs(); - - auto stats = processor->getProcessorDataStats(); - processor_elem.input_rows = stats.input_rows; - processor_elem.input_bytes = stats.input_bytes; - processor_elem.output_rows = stats.output_rows; - processor_elem.output_bytes = stats.output_bytes; - - processors_profile_log->add(processor_elem); + if (implicit_txn_control) + { + try + { + implicit_txn_control->executeCommit(context->getSessionContext()); + implicit_txn_control.reset(); + } + catch (const Exception &) + { + /// An exception might happen when trying to commit the transaction. For example we might get an immediate exception + /// because ZK is down and wait_changes_become_visible_after_commit_mode == WAIT_UNKNOWN + implicit_txn_control.reset(); + throw; } } } @@ -945,27 +961,11 @@ static std::tuple executeQueryImpl( query_span->addAttributeIfNotEmpty("clickhouse.tracestate", OpenTelemetry::CurrentContext().tracestate); query_span->addAttributeIfNotZero("clickhouse.read_rows", elem.read_rows); query_span->addAttributeIfNotZero("clickhouse.read_bytes", elem.read_bytes); - query_span->addAttributeIfNotZero("clickhouse.written_rows", info.written_rows); + query_span->addAttributeIfNotZero("clickhouse.written_rows", elem.written_rows); query_span->addAttributeIfNotZero("clickhouse.written_bytes", elem.written_bytes); query_span->addAttributeIfNotZero("clickhouse.memory_usage", elem.memory_usage); query_span->finish(); } - - if (implicit_txn_control) - { - try - { - implicit_txn_control->executeCommit(context->getSessionContext()); - implicit_txn_control.reset(); - } - catch (const Exception &) - { - /// An exception might happen when trying to commit the transaction. 
For example we might get an immediate exception - /// because ZK is down and wait_changes_become_visible_after_commit_mode == WAIT_UNKNOWN - implicit_txn_control.reset(); - throw; - } - } }; auto exception_callback = [elem, From 0c37c95c22754fc75884289f2b12d9d18af145d3 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 18:28:36 +0800 Subject: [PATCH 50/87] Add test cases Signed-off-by: Frank Chen --- ...simple_queries_for_opentelemetry.reference | 4 + .../02421_simple_queries_for_opentelemetry.sh | 83 +++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference create mode 100755 tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference new file mode 100644 index 00000000000..6b614bd6d82 --- /dev/null +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference @@ -0,0 +1,4 @@ +{"query":"show processlist format Null\n "} +{"query":"show databases format Null\n "} +{"query":"insert into opentelemetry_test values","read_rows":"3","read_bytes":"24","written_rows":"3","written_bytes":"24"} +{"query":"select * from opentelemetry_test format Null\n ","read_rows":"3","read_bytes":"24","written_rows":"","written_bytes":""} diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh new file mode 100755 index 00000000000..c6f8bddf571 --- /dev/null +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. 
"$CURDIR"/../shell_config.sh + +# This function takes 2 arguments: +# $1 - query +# $2 - trace id +function executeQuery() +{ + ${CLICKHOUSE_CLIENT} --database=${CLICKHOUSE_DATABASE} --opentelemetry_start_trace_probability=1 --query_id $1 -nq " + ${2} + " +} + +# For some quries, it's not able to know how many bytes/rows are read when tests are executed on CI, +# so we only to check the db.statement only +function check_query_span_query_only() +{ +${CLICKHOUSE_CLIENT} -nq " + SYSTEM FLUSH LOGS; + SELECT attribute['db.statement'] as query + FROM system.opentelemetry_span_log + WHERE finish_date >= yesterday() + AND operation_name = 'query' + AND attribute['clickhouse.query_id'] = '${1}' + Format JSONEachRow + ;" +} + +function check_query_span() +{ +${CLICKHOUSE_CLIENT} -nq " + SYSTEM FLUSH LOGS; + SELECT attribute['db.statement'] as query, + attribute['clickhouse.read_rows'] as read_rows, + attribute['clickhouse.read_bytes'] as read_bytes, + attribute['clickhouse.written_rows'] as written_rows, + attribute['clickhouse.written_bytes'] as written_bytes + FROM system.opentelemetry_span_log + WHERE finish_date >= yesterday() + AND operation_name = 'query' + AND attribute['clickhouse.query_id'] = '${1}' + Format JSONEachRow + ;" +} + +# +# Set up +# +${CLICKHOUSE_CLIENT} --database=${CLICKHOUSE_DATABASE} -nq " +DROP TABLE IF EXISTS opentelemetry_test; +CREATE TABLE opentelemetry_test (id UInt64) Engine=MergeTree Order By id; +" + +# test 1, a query that has special path in the code +# Format Null is used to make sure no output is generated so that it won't pollute the reference file +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +executeQuery $query_id 'show processlist format Null' +check_query_span_query_only "$query_id" + +# test 2, a normal show command +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +executeQuery $query_id 'show databases format Null' +check_query_span_query_only "$query_id" + +# test 3, a normal insert query on local table +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +executeQuery $query_id 'insert into opentelemetry_test values(1)(2)(3)' +check_query_span "$query_id" + +# test 4, a normal select query +query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); +executeQuery $query_id 'select * from opentelemetry_test format Null' +check_query_span $query_id + + +# +# Tear down +# +${CLICKHOUSE_CLIENT} --database=${CLICKHOUSE_DATABASE} -q " +DROP TABLE IF EXISTS opentelemetry_test; +" \ No newline at end of file From 9af591a32847dbb60d95d1376ec36d3825e9ebd1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 13 Sep 2022 12:29:02 +0200 Subject: [PATCH 51/87] Revert "Sharding s3 key names (2)" --- .../ObjectStorages/S3/S3ObjectStorage.cpp | 15 +--- src/Disks/ObjectStorages/StoredObject.h | 1 - tests/integration/runner | 3 +- .../test_join_set_family_s3/test.py | 2 +- tests/integration/test_log_family_s3/test.py | 2 +- tests/integration/test_merge_tree_s3/test.py | 68 ++++++++----------- .../test_profile_events_s3/test.py | 24 +++---- .../test_replicated_merge_tree_s3/test.py | 8 +-- .../test.py | 8 +-- .../test_s3_zero_copy_replication/test.py | 4 +- 10 files changed, 51 insertions(+), 84 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 45304ac2fac..998b521cc56 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -31,7 +31,6 @@ #include #include - namespace DB { @@ -91,19 
+90,7 @@ void logIfError(const Aws::Utils::Outcome & response, std::functi std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path */) { - /// Path to store the new S3 object. - - /// Total length is 32 a-z characters for enough randomness. - /// First 3 characters are used as a prefix for - /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ - - constexpr size_t key_name_total_size = 32; - constexpr size_t key_name_prefix_size = 3; - - /// Path to store new S3 object. - return fmt::format("{}/{}", - getRandomASCIIString(key_name_prefix_size), - getRandomASCIIString(key_name_total_size - key_name_prefix_size)); + return getRandomASCIIString(32); } Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const diff --git a/src/Disks/ObjectStorages/StoredObject.h b/src/Disks/ObjectStorages/StoredObject.h index d9faa766540..acb8a5fd127 100644 --- a/src/Disks/ObjectStorages/StoredObject.h +++ b/src/Disks/ObjectStorages/StoredObject.h @@ -3,7 +3,6 @@ #include #include - namespace DB { diff --git a/tests/integration/runner b/tests/integration/runner index e1b9a55b43e..f0d87b23a83 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -350,7 +350,8 @@ if __name__ == "__main__": # randomizer, we should remove it after Sep 2022 try: subprocess.check_call( - f"docker volume ls -q | grep '{VOLUME_NAME}_.*_volume' | xargs --no-run-if-empty docker volume rm", + "docker volume rm $(docker volume ls -q | " + f"grep '{VOLUME_NAME}_.*_volume')", shell=True, ) except Exception as ex: diff --git a/tests/integration/test_join_set_family_s3/test.py b/tests/integration/test_join_set_family_s3/test.py index 38b56b7b15b..b09d5735628 100644 --- a/tests/integration/test_join_set_family_s3/test.py +++ b/tests/integration/test_join_set_family_s3/test.py @@ -27,7 +27,7 @@ def cluster(): def assert_objects_count(cluster, objects_count, path="data/"): minio = cluster.minio_client - s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) + s3_objects = list(minio.list_objects(cluster.minio_bucket, path)) if objects_count != len(s3_objects): for s3_object in s3_objects: object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) diff --git a/tests/integration/test_log_family_s3/test.py b/tests/integration/test_log_family_s3/test.py index bed379d098b..76ff0930db3 100644 --- a/tests/integration/test_log_family_s3/test.py +++ b/tests/integration/test_log_family_s3/test.py @@ -25,7 +25,7 @@ def cluster(): def assert_objects_count(cluster, objects_count, path="data/"): minio = cluster.minio_client - s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) + s3_objects = list(minio.list_objects(cluster.minio_bucket, path)) if objects_count != len(s3_objects): for s3_object in s3_objects: object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 4276125c347..544f064bdff 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -120,17 +120,11 @@ def run_s3_mocks(cluster): def wait_for_delete_s3_objects(cluster, expected, timeout=30): minio = cluster.minio_client while timeout > 0: - if ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == expected - ): + if 
len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == expected: return timeout -= 1 time.sleep(1) - assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == expected - ) + assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == expected @pytest.fixture(autouse=True) @@ -146,9 +140,7 @@ def drop_table(cluster, node_name): wait_for_delete_s3_objects(cluster, 0) finally: # Remove extra objects to prevent tests cascade failing - for obj in list( - minio.list_objects(cluster.minio_bucket, "data/", recursive=True) - ): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -170,7 +162,7 @@ def test_simple_insert_select( node.query("INSERT INTO s3_test VALUES {}".format(values1)) assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values") == values1 assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + files_per_part ) @@ -181,7 +173,7 @@ def test_simple_insert_select( == values1 + "," + values2 ) assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + files_per_part * 2 ) @@ -225,7 +217,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical, node_name): node.query("SELECT count(distinct(id)) FROM s3_test FORMAT Values") == "(8192)" ) assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD_PER_PART_WIDE * 6 + FILES_OVERHEAD ) @@ -314,28 +306,28 @@ def test_attach_detach_partition(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DETACH PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test ATTACH PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DROP PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE ) @@ -346,8 +338,7 @@ def test_attach_detach_partition(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -365,21 +356,21 @@ def test_move_partition_to_another_disk(cluster, 
node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE ) node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 's3'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -400,7 +391,7 @@ def test_table_manipulations(cluster, node_name): node.query("RENAME TABLE s3_test TO s3_renamed") assert node.query("SELECT count(*) FROM s3_renamed FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("RENAME TABLE s3_renamed TO s3_test") @@ -411,15 +402,14 @@ def test_table_manipulations(cluster, node_name): node.query("ATTACH TABLE s3_test") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("TRUNCATE TABLE s3_test") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -444,7 +434,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -458,7 +448,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT count(*) FROM s3_clone FORMAT Values") == "(8192)" # Number of objects in S3 should be unchanged. 
assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -472,7 +462,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 6 ) @@ -493,14 +483,14 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" # Data should remain in S3 assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) node.query("ALTER TABLE s3_test FREEZE") # Number S3 objects should be unchanged. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -509,7 +499,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): wait_for_delete_s3_objects(cluster, FILES_OVERHEAD_PER_PART_WIDE * 4) - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -530,7 +520,7 @@ def test_freeze_unfreeze(cluster, node_name): node.query("TRUNCATE TABLE s3_test") assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -543,8 +533,7 @@ def test_freeze_unfreeze(cluster, node_name): # Data should be removed from S3. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -567,7 +556,7 @@ def test_freeze_system_unfreeze(cluster, node_name): node.query("TRUNCATE TABLE s3_test") node.query("DROP TABLE s3_test_removed NO DELAY") assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -576,8 +565,7 @@ def test_freeze_system_unfreeze(cluster, node_name): # Data should be removed from S3. 
assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) - == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD ) @@ -704,7 +692,7 @@ def test_lazy_seek_optimization_for_async_read(cluster, node_name): node.query("SELECT * FROM s3_test WHERE value LIKE '%abc%' ORDER BY value LIMIT 10") node.query("DROP TABLE IF EXISTS s3_test NO DELAY") minio = cluster.minio_client - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) diff --git a/tests/integration/test_profile_events_s3/test.py b/tests/integration/test_profile_events_s3/test.py index 18f1c5ee9ad..a0f664df000 100644 --- a/tests/integration/test_profile_events_s3/test.py +++ b/tests/integration/test_profile_events_s3/test.py @@ -62,7 +62,7 @@ init_list = { def get_s3_events(instance): result = init_list.copy() events = instance.query( - "SELECT event, value FROM system.events WHERE event LIKE '%S3%'" + "SELECT event,value FROM system.events WHERE event LIKE '%S3%'" ).split("\n") for event in events: ev = event.split("\t") @@ -85,20 +85,20 @@ def get_minio_stat(cluster): ) ).text.split("\n") for line in stat: - x = re.search(r"s3_requests_total(\{.*\})?\s(\d+)(\s.*)?", line) + x = re.search("s3_requests_total(\{.*\})?\s(\d+)(\s.*)?", line) if x != None: y = re.search('.*api="(get|list|head|select).*', x.group(1)) if y != None: result["get_requests"] += int(x.group(2)) else: result["set_requests"] += int(x.group(2)) - x = re.search(r"s3_errors_total(\{.*\})?\s(\d+)(\s.*)?", line) + x = re.search("s3_errors_total(\{.*\})?\s(\d+)(\s.*)?", line) if x != None: result["errors"] += int(x.group(2)) - x = re.search(r"s3_rx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) + x = re.search("s3_rx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) if x != None: result["tx_bytes"] += float(x.group(2)) - x = re.search(r"s3_tx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) + x = re.search("s3_tx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) if x != None: result["rx_bytes"] += float(x.group(2)) return result @@ -128,10 +128,8 @@ def get_query_stat(instance, hint): def get_minio_size(cluster): minio = cluster.minio_client size = 0 - for obj_level1 in minio.list_objects( - cluster.minio_bucket, prefix="data/", recursive=True - ): - size += obj_level1.size + for obj in minio.list_objects(cluster.minio_bucket, "data/"): + size += obj.size return size @@ -147,7 +145,7 @@ def test_profile_events(cluster): metrics0 = get_s3_events(instance) minio0 = get_minio_stat(cluster) - query1 = "CREATE TABLE test_s3.test_s3 (key UInt32, value UInt32) ENGINE=MergeTree PRIMARY KEY key ORDER BY key SETTINGS storage_policy = 's3'" + query1 = "CREATE TABLE test_s3.test_s3 (key UInt32, value UInt32) ENGINE=MergeTree PRIMARY KEY key ORDER BY key SETTINGS storage_policy='s3'" instance.query(query1) size1 = get_minio_size(cluster) @@ -169,7 +167,7 @@ def test_profile_events(cluster): metrics1["WriteBufferFromS3Bytes"] - metrics0["WriteBufferFromS3Bytes"] == size1 ) - query2 = "INSERT INTO test_s3.test_s3 VALUES" + query2 = "INSERT INTO test_s3.test_s3 FORMAT Values" instance.query(query2 + " (1,1)") size2 = get_minio_size(cluster) @@ -184,12 +182,9 @@ def test_profile_events(cluster): metrics2["S3WriteRequestsCount"] - metrics1["S3WriteRequestsCount"] == minio2["set_requests"] - minio1["set_requests"] ) - stat2 = 
get_query_stat(instance, query2) - for metric in stat2: assert stat2[metric] == metrics2[metric] - metrics1[metric] - assert ( metrics2["WriteBufferFromS3Bytes"] - metrics1["WriteBufferFromS3Bytes"] == size2 - size1 @@ -210,7 +205,6 @@ def test_profile_events(cluster): == minio3["set_requests"] - minio2["set_requests"] ) stat3 = get_query_stat(instance, query3) - # With async reads profile events are not updated fully because reads are done in a separate thread. # for metric in stat3: # print(metric) diff --git a/tests/integration/test_replicated_merge_tree_s3/test.py b/tests/integration/test_replicated_merge_tree_s3/test.py index 0d978bb6967..37027d07969 100644 --- a/tests/integration/test_replicated_merge_tree_s3/test.py +++ b/tests/integration/test_replicated_merge_tree_s3/test.py @@ -113,7 +113,7 @@ def drop_table(cluster): minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -130,9 +130,9 @@ def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_par insert(cluster, node_idxs=[1, 2, 3], verify=True) minio = cluster.minio_client - assert len( - list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)) - ) == 3 * (FILES_OVERHEAD + files_per_part * 3) + assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == 3 * ( + FILES_OVERHEAD + files_per_part * 3 + ) def test_drop_cache_on_cluster(cluster): diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py index 60a1b9b9746..73b611ad169 100644 --- a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -87,7 +87,7 @@ def drop_table(cluster): minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -124,6 +124,6 @@ def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_par ) minio = cluster.minio_client - assert len( - list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)) - ) == (3 * FILES_OVERHEAD) + (files_per_part * 3) + assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == ( + 3 * FILES_OVERHEAD + ) + (files_per_part * 3) diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 860b83d4ed1..7b7fb9d21ad 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -39,9 +39,7 @@ def cluster(): def get_large_objects_count(cluster, size=100, folder="data"): minio = cluster.minio_client counter = 0 - for obj in minio.list_objects( - cluster.minio_bucket, "{}/".format(folder), recursive=True - ): + for obj in minio.list_objects(cluster.minio_bucket, "{}/".format(folder)): if obj.size is not None and obj.size >= size: counter = counter + 1 return counter From b80eb57153d51f851c1ab8ffc3181237dd91ad2a Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 13 Sep 2022 12:40:36 +0200 Subject: 
[PATCH 52/87] Don't run test in fasttest --- .../0_stateless/02421_record_errors_row_by_input_format.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh index 835769e19f1..dda61512936 100755 --- a/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh +++ b/tests/queries/0_stateless/02421_record_errors_row_by_input_format.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-parallel +# Tags: no-parallel, no-fasttest set -eu From bd773e6918c63cb4186afb45f8384a45e7bba597 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 13 Sep 2022 15:03:07 +0200 Subject: [PATCH 53/87] Fix download_binary, use proper version and commit --- .github/workflows/release.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ae905aa62ba..001f6d9e669 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -30,10 +30,11 @@ jobs: cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" cd "$REPO_COPY" # Download and push packages to artifactory - python3 ./tests/ci/push_to_artifactory.py --release "${{ github.ref }}" \ - --commit '${{ github.sha }}' --artifactory-url "${{ secrets.JFROG_ARTIFACTORY_URL }}" --all + python3 ./tests/ci/push_to_artifactory.py --release '${{ github.ref }}' \ + --commit '${{ github.sha }}' --artifactory-url '${{ secrets.JFROG_ARTIFACTORY_URL }}' --all # Download macos binaries to ${{runner.temp}}/download_binary - python3 ./tests/ci/download_binary.py binary_darwin binary_darwin_aarch64 + python3 ./tests/ci/download_binary.py --version '${{ github.ref }}' \ + --commit '${{ github.sha }}' binary_darwin binary_darwin_aarch64 mv '${{runner.temp}}/download_binary/'clickhouse-* '${{runner.temp}}/push_to_artifactory' - name: Upload packages to release assets uses: svenstaro/upload-release-action@v2 From f8dcf01aa6e853b52757d573b3cd0032a93aa11b Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 13 Sep 2022 21:04:39 +0800 Subject: [PATCH 54/87] Fix shellcheck Signed-off-by: Frank Chen --- .../0_stateless/02421_simple_queries_for_opentelemetry.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh index c6f8bddf571..b270c809d77 100755 --- a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh # This function takes 2 arguments: From f2acb53887807ce0d1b681a455b0be77589500d1 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 13 Sep 2022 13:49:51 +0000 Subject: [PATCH 55/87] Better log message for replicas number in StorageReplicatedMerge --- src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 13 ++++++++++--- src/Storages/MergeTree/ReplicatedMergeTreeSink.h | 1 + src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 6c7fbcb52d8..c72394cc854 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -203,11 +203,11 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) } block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token); - LOG_DEBUG(log, "Wrote block with ID '{}', {} rows on {} replicas", block_id, current_block.block.rows(), replicas_num); + LOG_DEBUG(log, "Wrote block with ID '{}', {} rows{}", block_id, current_block.block.rows(), quorumLogMessage(replicas_num)); } else { - LOG_DEBUG(log, "Wrote block with {} rows on {} replicas", current_block.block.rows(), replicas_num); + LOG_DEBUG(log, "Wrote block with {} rows{}", current_block.block.rows(), quorumLogMessage(replicas_num)); } UInt64 elapsed_ns = watch.elapsed(); @@ -639,7 +639,7 @@ void ReplicatedMergeTreeSink::waitForQuorum( size_t replicas_num) const { /// We are waiting for quorum to be satisfied. - LOG_TRACE(log, "Waiting for quorum '{}' for part {} on {} replicas", quorum_path, part_name, replicas_num); + LOG_TRACE(log, "Waiting for quorum '{}' for part {}{}", quorum_path, part_name, quorumLogMessage(replicas_num)); try { @@ -684,6 +684,13 @@ void ReplicatedMergeTreeSink::waitForQuorum( LOG_TRACE(log, "Quorum '{}' for part {} satisfied", quorum_path, part_name); } +String ReplicatedMergeTreeSink::quorumLogMessage(size_t replicas_num) const +{ + if (!isQuorumEnabled()) + return ""; + return fmt::format(" (quorum {} of {} replicas)", getQuorumSize(replicas_num), replicas_num); +} + size_t ReplicatedMergeTreeSink::getQuorumSize(size_t replicas_num) const { if (!isQuorumEnabled()) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index 48e94ef5659..ab729e6edec 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -96,6 +96,7 @@ private: size_t getQuorumSize(size_t replicas_num) const; bool isQuorumEnabled() const; + String quorumLogMessage(size_t replicas_num) const; /// Used in logs for debug purposes size_t quorum_timeout_ms; size_t max_parts_per_block; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index c8eef26440e..1f6d850f90b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3649,7 +3649,7 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_ if (quorum_entry.replicas.size() >= quorum_entry.required_number_of_replicas) { /// The quorum is reached. Delete the node, and update information about the last part that was successfully written with quorum. 
- LOG_TRACE(log, "Got {} (of {}) replicas confirmed quorum {}, going to remove node", + LOG_TRACE(log, "Got {} (of {} required) replicas confirmed quorum {}, going to remove node", quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas, quorum_status_path); Coordination::Requests ops; From 72a77ba8c10683f5eea32fc0df4ab5559abda0ec Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 13 Sep 2022 13:38:45 +0200 Subject: [PATCH 56/87] Update VERSION in server Dockerfiles on new tag --- .github/workflows/tags_stable.yml | 1 + utils/list-versions/update-docker-version.sh | 10 ++++++++++ 2 files changed, 11 insertions(+) create mode 100755 utils/list-versions/update-docker-version.sh diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index 9711f7688cb..a9172a8a2e2 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -43,6 +43,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} run: | ./utils/list-versions/list-versions.sh > ./utils/list-versions/version_date.tsv + ./utils/list-versions/update-docker-version.sh GID=$(id -g "${UID}") docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 \ --volume="${GITHUB_WORKSPACE}:/ClickHouse" clickhouse/style-test \ diff --git a/utils/list-versions/update-docker-version.sh b/utils/list-versions/update-docker-version.sh new file mode 100755 index 00000000000..429da330a9f --- /dev/null +++ b/utils/list-versions/update-docker-version.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +# We check only our code, that's why we skip contrib +GIT_ROOT=$(git rev-parse --show-cdup) +GIT_ROOT=${GIT_ROOT:-.} +VERSION=$(sed -e '1 s/^v//; 1 s/-.*//p; d' "$GIT_ROOT"/utils/list-versions/version_date.tsv) + +find "$GIT_ROOT/docker/server/" -name 'Dockerfile.*' -print0 | xargs -0 sed -i "/^ARG VERSION=/ s/^.*$/ARG VERSION=\"$VERSION\"/" From dbe919e992152991e88c209f096120a2c60ac884 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 13 Sep 2022 13:39:32 +0200 Subject: [PATCH 57/87] Update server Dockerfiles to the latest version --- docker/server/Dockerfile.alpine | 2 +- docker/server/Dockerfile.ubuntu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index b01dba1e22f..1a672f30a74 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -33,7 +33,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="20.9.3.45" +ARG VERSION="22.8.5.29" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # user/group precreated explicitly with fixed uid/gid on purpose. 
diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index f4102a6ccaf..db76a9fab1d 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -21,7 +21,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION=22.6.1.* +ARG VERSION="22.8.5.29" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" # set non-empty deb_location_url url to create a docker image From fc2e7159f01069b37fb8912f6a689c2ea0575dde Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Tue, 13 Sep 2022 12:40:16 -0400 Subject: [PATCH 58/87] add note about privs --- docs/en/sql-reference/statements/delete.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 487dfc87f9a..18ebc241d51 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -35,3 +35,9 @@ An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER T Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on OLTP system. Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. ::: +:::note +DELETE FROM requires the ALTER DELETE privledge: +```sql +grant ALTER DELETE ON db.table to username; +``` +::: From 64bd1ea77ae5e3b79d5a89c418c922314f3e9ff0 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Tue, 13 Sep 2022 12:42:24 -0400 Subject: [PATCH 59/87] spelling --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 18ebc241d51..0a886618348 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -36,7 +36,7 @@ Even though deletes are becoming more lightweight in ClickHouse, they should sti ::: :::note -DELETE FROM requires the ALTER DELETE privledge: +DELETE FROM requires the ALTER DELETE privilege: ```sql grant ALTER DELETE ON db.table to username; ``` From e85b2221363cd296fe47ff1aba8c2b597f48b181 Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Tue, 13 Sep 2022 10:54:47 -0600 Subject: [PATCH 60/87] Update delete.md --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 0a886618348..65cd073e530 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -32,7 +32,7 @@ SET allow_experimental_lightweight_delete = true; An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. :::warning -Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on OLTP system. 
Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. +Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as an OLTP system. Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. ::: :::note From b833c39dcf577710993de9bb760b83c97ff95f13 Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Tue, 13 Sep 2022 10:55:26 -0600 Subject: [PATCH 61/87] Update delete.md --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 65cd073e530..ef3487ff754 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -36,7 +36,7 @@ Even though deletes are becoming more lightweight in ClickHouse, they should sti ::: :::note -DELETE FROM requires the ALTER DELETE privilege: +`DELETE FROM` requires the `ALTER DELETE` privilege: ```sql grant ALTER DELETE ON db.table to username; ``` From 078b9bff3c4c23add4395b1c8b044f74af3c9b8f Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Tue, 13 Sep 2022 13:18:01 -0400 Subject: [PATCH 62/87] Update docs/en/sql-reference/statements/delete.md --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index ef3487ff754..0dc6cc0d09a 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -32,7 +32,7 @@ SET allow_experimental_lightweight_delete = true; An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. :::warning -Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as an OLTP system. Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. +Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on an OLTP system. Ligthweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. ::: :::note From 49136d26aa22d4aa0385d0895f7c2456c559de8b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 13 Sep 2022 19:45:46 +0000 Subject: [PATCH 63/87] docs: fix internal links The links at the very beginning of https://clickhouse.com/docs/en/sql-reference/statements/system don't work. They are reference other sections of the same document. This is weird because there is a small index already on the right side. I searched our documentation and this seems to be the only pages which do so. Therefore removing the links altogether instead of fixing them. 
--- docs/en/sql-reference/statements/system.md | 39 ---------------------- docs/ru/sql-reference/statements/system.md | 37 -------------------- docs/zh/sql-reference/statements/system.md | 32 ------------------ 3 files changed, 108 deletions(-) diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 9b7527caaa9..be7bbc94ec5 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -6,45 +6,6 @@ sidebar_label: SYSTEM # SYSTEM Statements -The list of available `SYSTEM` statements: - -- [RELOAD EMBEDDED DICTIONARIES](#query_language-system-reload-emdedded-dictionaries) -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [RELOAD MODELS](#query_language-system-reload-models) -- [RELOAD MODEL](#query_language-system-reload-model) -- [RELOAD FUNCTIONS](#query_language-system-reload-functions) -- [RELOAD FUNCTION](#query_language-system-reload-functions) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [DROP UNCOMPRESSED CACHE](#query_language-system-drop-uncompressed-cache) -- [DROP COMPILED EXPRESSION CACHE](#query_language-system-drop-compiled-expression-cache) -- [DROP REPLICA](#query_language-system-drop-replica) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) -- [STOP TTL MERGES](#query_language-stop-ttl-merges) -- [START TTL MERGES](#query_language-start-ttl-merges) -- [STOP MOVES](#query_language-stop-moves) -- [START MOVES](#query_language-start-moves) -- [SYSTEM UNFREEZE](#query_language-system-unfreeze) -- [STOP FETCHES](#query_language-system-stop-fetches) -- [START FETCHES](#query_language-system-start-fetches) -- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [START REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [STOP REPLICATION QUEUES](#query_language-system-stop-replication-queues) -- [START REPLICATION QUEUES](#query_language-system-start-replication-queues) -- [SYNC REPLICA](#query_language-system-sync-replica) -- [RESTART REPLICA](#query_language-system-restart-replica) -- [RESTORE REPLICA](#query_language-system-restore-replica) -- [RESTART REPLICAS](#query_language-system-restart-replicas) - ## RELOAD EMBEDDED DICTIONARIES Reload all [Internal dictionaries](../../sql-reference/dictionaries/internal-dicts.md). 
diff --git a/docs/ru/sql-reference/statements/system.md b/docs/ru/sql-reference/statements/system.md index c1dc03a63d1..1fbae741dad 100644 --- a/docs/ru/sql-reference/statements/system.md +++ b/docs/ru/sql-reference/statements/system.md @@ -6,43 +6,6 @@ sidebar_label: SYSTEM # Запросы SYSTEM {#query-language-system} -- [RELOAD EMBEDDED DICTIONARIES](#query_language-system-reload-emdedded-dictionaries) -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [RELOAD MODELS](#query_language-system-reload-models) -- [RELOAD MODEL](#query_language-system-reload-model) -- [RELOAD FUNCTIONS](#query_language-system-reload-functions) -- [RELOAD FUNCTION](#query_language-system-reload-functions) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [DROP UNCOMPRESSED CACHE](#query_language-system-drop-uncompressed-cache) -- [DROP COMPILED EXPRESSION CACHE](#query_language-system-drop-compiled-expression-cache) -- [DROP REPLICA](#query_language-system-drop-replica) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) -- [STOP TTL MERGES](#query_language-stop-ttl-merges) -- [START TTL MERGES](#query_language-start-ttl-merges) -- [STOP MOVES](#query_language-stop-moves) -- [START MOVES](#query_language-start-moves) -- [SYSTEM UNFREEZE](#query_language-system-unfreeze) -- [STOP FETCHES](#query_language-system-stop-fetches) -- [START FETCHES](#query_language-system-start-fetches) -- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [START REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [STOP REPLICATION QUEUES](#query_language-system-stop-replication-queues) -- [START REPLICATION QUEUES](#query_language-system-start-replication-queues) -- [SYNC REPLICA](#query_language-system-sync-replica) -- [RESTART REPLICA](#query_language-system-restart-replica) -- [RESTORE REPLICA](#query_language-system-restore-replica) -- [RESTART REPLICAS](#query_language-system-restart-replicas) - ## RELOAD EMBEDDED DICTIONARIES] {#query_language-system-reload-emdedded-dictionaries} Перегружает все [Встроенные словари](../dictionaries/internal-dicts.md). По умолчанию встроенные словари выключены. 
diff --git a/docs/zh/sql-reference/statements/system.md b/docs/zh/sql-reference/statements/system.md index d833887a9c6..3df00cf8854 100644 --- a/docs/zh/sql-reference/statements/system.md +++ b/docs/zh/sql-reference/statements/system.md @@ -6,38 +6,6 @@ sidebar_label: SYSTEM # SYSTEM Queries {#query-language-system} -- [RELOAD EMBEDDED DICTIONARIES](#query_language-system-reload-emdedded-dictionaries) -- [RELOAD DICTIONARIES](#query_language-system-reload-dictionaries) -- [RELOAD DICTIONARY](#query_language-system-reload-dictionary) -- [DROP DNS CACHE](#query_language-system-drop-dns-cache) -- [DROP MARK CACHE](#query_language-system-drop-mark-cache) -- [DROP UNCOMPRESSED CACHE](#query_language-system-drop-uncompressed-cache) -- [DROP COMPILED EXPRESSION CACHE](#query_language-system-drop-compiled-expression-cache) -- [DROP REPLICA](#query_language-system-drop-replica) -- [FLUSH LOGS](#query_language-system-flush_logs) -- [RELOAD CONFIG](#query_language-system-reload-config) -- [SHUTDOWN](#query_language-system-shutdown) -- [KILL](#query_language-system-kill) -- [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends) -- [FLUSH DISTRIBUTED](#query_language-system-flush-distributed) -- [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) -- [STOP MERGES](#query_language-system-stop-merges) -- [START MERGES](#query_language-system-start-merges) -- [STOP TTL MERGES](#query_language-stop-ttl-merges) -- [START TTL MERGES](#query_language-start-ttl-merges) -- [STOP MOVES](#query_language-stop-moves) -- [START MOVES](#query_language-start-moves) -- [SYSTEM UNFREEZE](#query_language-system-unfreeze) -- [STOP FETCHES](#query_language-system-stop-fetches) -- [START FETCHES](#query_language-system-start-fetches) -- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [START REPLICATED SENDS](#query_language-system-start-replicated-sends) -- [STOP REPLICATION QUEUES](#query_language-system-stop-replication-queues) -- [START REPLICATION QUEUES](#query_language-system-start-replication-queues) -- [SYNC REPLICA](#query_language-system-sync-replica) -- [RESTART REPLICA](#query_language-system-restart-replica) -- [RESTART REPLICAS](#query_language-system-restart-replicas) - ## RELOAD EMBEDDED DICTIONARIES\] {#query_language-system-reload-emdedded-dictionaries} 重新加载所有[内置字典](../../sql-reference/dictionaries/internal-dicts.md)。默认情况下内置字典是禁用的。 From f35296eea6a9cf6ec157a801d0dd7d7015f89206 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 12 Sep 2022 22:03:20 +0200 Subject: [PATCH 64/87] Add files with revision to ignore for git blame This will improve output of the blame. Can be configured as follow: $ git config blame.ignoreRevsFile .git-blame-ignore-revs Signed-off-by: Azat Khuzhin --- .git-blame-ignore-revs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 00000000000..06e893fabb3 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,15 @@ +# This is a file that can be used by git-blame to ignore some revisions. +# (git 2.23+, released in August 2019) +# +# Can be configured as follow: +# +# $ git config blame.ignoreRevsFile .git-blame-ignore-revs +# +# For more information you can look at git-blame(1) man page. 
+ +# Changed tabs to spaces in code [#CLICKHOUSE-3] +137ad95929ee016cc6d3c03bccb5586941c163ff + +# dbms/ → src/ +# (though it is unlikely that you will see it in blame) +06446b4f08a142d6f1bc30664c47ded88ab51782 From ad9e7be56617f51afd1d4e6c86c23444ad6040ad Mon Sep 17 00:00:00 2001 From: Dan Roscigno Date: Tue, 13 Sep 2022 17:23:41 -0400 Subject: [PATCH 65/87] Update README.md When running `changelog.py` I did not have tags and needed to supply a `--from` with a commit SHA. Having the tags allows the script to go back to the last release. --- utils/changelog/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/changelog/README.md b/utils/changelog/README.md index 8218af83d96..7ecc727d248 100644 --- a/utils/changelog/README.md +++ b/utils/changelog/README.md @@ -13,6 +13,7 @@ python3 changelog.py -h Usage example: ``` +git fetch --all # changelog.py depends on having the tags available, this will fetch them python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$GITHUB_TOKEN" v21.6.2.7-prestable python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$USER" --gh-password="$PASSWORD" v21.6.2.7-prestable ``` From ee5445174189c78507b83482e0c7815cb584dd39 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 13 Sep 2022 22:43:59 +0000 Subject: [PATCH 66/87] fix adding of columns of type Object --- src/DataTypes/ObjectUtils.cpp | 11 +++++ src/DataTypes/ObjectUtils.h | 40 ++++++++++--------- src/Storages/MergeTree/MergeTreeData.h | 6 +-- .../MergeTree/MergeTreeDataWriter.cpp | 10 ----- src/Storages/MergeTree/MergeTreeDataWriter.h | 2 - src/Storages/MergeTree/MergeTreeSink.cpp | 3 +- .../MergeTree/ReplicatedMergeTreeSink.cpp | 3 +- src/Storages/StorageMergeTree.cpp | 7 ++++ src/Storages/StorageReplicatedMergeTree.cpp | 7 ++++ .../01825_type_json_add_column.reference.j2 | 6 +++ .../01825_type_json_add_column.sql.j2 | 23 +++++++++++ 11 files changed, 82 insertions(+), 36 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_add_column.reference.j2 create mode 100644 tests/queries/0_stateless/01825_type_json_add_column.sql.j2 diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 3cf557ec5bf..c14b9b579ea 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -159,6 +160,16 @@ void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_st } } +void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block) +{ + if (!storage_snapshot->object_columns.empty()) + { + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + auto storage_columns = storage_snapshot->getColumns(options); + convertObjectsToTuples(block, storage_columns); + } +} + static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts) { if (prefix.size() > parts.size()) diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 2dde0ed3e65..f5ab1813e00 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -11,6 +11,9 @@ namespace DB { +struct StorageSnapshot; +using StorageSnapshotPtr = std::shared_ptr; + /// Returns number of dimensions in Array type. 0 if type is not array. 
size_t getNumberOfDimensions(const IDataType & type); @@ -38,6 +41,7 @@ DataTypePtr getDataTypeByColumn(const IColumn & column); /// Converts Object types and columns to Tuples in @columns_list and @block /// and checks that types are consistent with types in @extended_storage_columns. void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_storage_columns); +void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); /// Checks that each path is not the prefix of any other path. void checkObjectHasNoAmbiguosPaths(const PathsInData & paths); @@ -164,27 +168,24 @@ ColumnsDescription getObjectColumns( const ColumnsDescription & storage_columns, EntryColumnsGetter && entry_columns_getter) { - ColumnsDescription res; - - if (begin == end) - { - for (const auto & column : storage_columns) - { - if (isObject(column.type)) - { - auto tuple_type = std::make_shared( - DataTypes{std::make_shared()}, - Names{ColumnObject::COLUMN_NAME_DUMMY}); - - res.add({column.name, std::move(tuple_type)}); - } - } - - return res; - } - std::unordered_map types_in_entries; + /// Add dummy column for all Object columns + /// to not to lost any column if it's missing + /// in all entries. If it exists in any entry + /// dummy column will be removed. + for (const auto & column : storage_columns) + { + if (isObject(column.type)) + { + auto tuple_type = std::make_shared( + DataTypes{std::make_shared()}, + Names{ColumnObject::COLUMN_NAME_DUMMY}); + + types_in_entries[column.name].push_back(std::move(tuple_type)); + } + } + for (auto it = begin; it != end; ++it) { const auto & entry_columns = entry_columns_getter(*it); @@ -196,6 +197,7 @@ ColumnsDescription getObjectColumns( } } + ColumnsDescription res; for (const auto & [name, types] : types_in_entries) res.add({name, getLeastCommonTypeForObject(types)}); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 4158517fc23..0eae622d9fb 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1242,6 +1242,9 @@ protected: /// Attaches restored parts to the storage. virtual void attachRestoredParts(MutableDataPartsVector && parts) = 0; + void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); + void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); + static void incrementInsertedPartsProfileEvent(MergeTreeDataPartType type); static void incrementMergedPartsProfileEvent(MergeTreeDataPartType type); @@ -1329,9 +1332,6 @@ private: DataPartsVector & duplicate_parts_to_remove, MutableDataPartsVector & parts_from_wal); - void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); - void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); - /// Create zero-copy exclusive lock for part and disk. Useful for coordination of /// distributed operations which can lead to data duplication. Implemented only in ReplicatedMergeTree. 
virtual std::optional tryCreateZeroCopyExclusiveLock(const String &, const DiskPtr &) { return std::nullopt; } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 97900eef22b..95faef6aac7 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -483,16 +483,6 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( return temp_part; } -void MergeTreeDataWriter::deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block) -{ - if (!storage_snapshot->object_columns.empty()) - { - auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); - auto storage_columns = storage_snapshot->getColumns(options); - convertObjectsToTuples(block, storage_columns); - } -} - MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( const String & part_name, MergeTreeDataPartType part_type, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 2f9ab1aae8b..00438a29fa1 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -45,8 +45,6 @@ public: */ static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); - static void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); - /// This structure contains not completely written temporary part. /// Some writes may happen asynchronously, e.g. for blob storages. /// You should call finalize() to wait until all data is written. diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 5eaa8ec8004..5d00db861a8 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace ProfileEvents @@ -56,7 +57,7 @@ void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); - storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); + deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 6c7fbcb52d8..89a34a39c13 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -161,7 +162,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) */ size_t replicas_num = checkQuorumPrecondition(zookeeper); - storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); + deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 5adc1974257..e4062734352 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -335,6 +335,13 @@ void StorageMergeTree::alter( mutation_version = startMutation(maybe_mutation_commands, local_context); } + { + /// Reset Object 
columns, because column of type + /// Object may be added or dropped by alter. + auto parts_lock = lockParts(); + resetObjectColumnsFromActiveParts(parts_lock); + } + /// Always execute required mutations synchronously, because alters /// should be executed in sequential order. if (!maybe_mutation_commands.empty()) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a2d10e57f8f..20567846924 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4649,6 +4649,13 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer LOG_INFO(log, "Applied changes to the metadata of the table. Current metadata version: {}", metadata_version); } + { + /// Reset Object columns, because column of type + /// Object may be added or dropped by alter. + auto parts_lock = lockParts(); + resetObjectColumnsFromActiveParts(parts_lock); + } + /// This transaction may not happen, but it's OK, because on the next retry we will eventually create/update this node /// TODO Maybe do in in one transaction for Replicated database? zookeeper->createOrUpdate(fs::path(replica_path) / "metadata_version", std::to_string(metadata_version), zkutil::CreateMode::Persistent); diff --git a/tests/queries/0_stateless/01825_type_json_add_column.reference.j2 b/tests/queries/0_stateless/01825_type_json_add_column.reference.j2 new file mode 100644 index 00000000000..da724aef01a --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_add_column.reference.j2 @@ -0,0 +1,6 @@ +{% for storage in ["MergeTree", "ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_add_column/', 'r1')"] -%} +{"id":"1","s":{"k1":0}} +{"id":"2","s":{"k1":100}} +{"id":"1"} +{"id":"2"} +{% endfor -%} diff --git a/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 b/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 new file mode 100644 index 00000000000..87c76c042a6 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_add_column.sql.j2 @@ -0,0 +1,23 @@ +-- Tags: no-fasttest + +{% for storage in ["MergeTree", "ReplicatedMergeTree('/clickhouse/tables/{database}/test_01825_add_column/', 'r1')"] -%} + +DROP TABLE IF EXISTS t_json_add_column; +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json_add_column (id UInt64) ENGINE = {{ storage }} ORDER BY tuple(); + +INSERT INTO t_json_add_column VALUES (1); +ALTER TABLE t_json_add_column ADD COLUMN s JSON; + +INSERT INTO t_json_add_column VALUES(2, '{"k1": 100}'); + +SELECT * FROM t_json_add_column ORDER BY id FORMAT JSONEachRow; + +ALTER TABLE t_json_add_column DROP COLUMN s; + +SELECT * FROM t_json_add_column ORDER BY id FORMAT JSONEachRow; + +DROP TABLE t_json_add_column; + +{% endfor -%} From 6e5ba2648d769f5f544ac09fc23d859674fbfa31 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Tue, 13 Sep 2022 20:54:56 -0300 Subject: [PATCH 67/87] Update clickhouse-server.init --- packages/clickhouse-server.init | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/clickhouse-server.init b/packages/clickhouse-server.init index 1695f6286b8..13aeffe13a7 100755 --- a/packages/clickhouse-server.init +++ b/packages/clickhouse-server.init @@ -47,9 +47,10 @@ CLICKHOUSE_PIDFILE="$CLICKHOUSE_PIDDIR/$PROGRAM.pid" # Some systems lack "flock" command -v flock >/dev/null && FLOCK=flock -# Override defaults from optional config file +# Override defaults from optional config file and export them automatically +set -a test -f 
/etc/default/clickhouse && . /etc/default/clickhouse - +set +a die() { From 569167d9aa99c71ccd1a46d1ffc689f94bb993f3 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 14 Sep 2022 11:43:23 +0800 Subject: [PATCH 68/87] Improve the test to address review comments Signed-off-by: Frank Chen --- ...simple_queries_for_opentelemetry.reference | 4 +-- .../02421_simple_queries_for_opentelemetry.sh | 32 +++++++++---------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference index 6b614bd6d82..d167d905636 100644 --- a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.reference @@ -1,4 +1,4 @@ {"query":"show processlist format Null\n "} {"query":"show databases format Null\n "} -{"query":"insert into opentelemetry_test values","read_rows":"3","read_bytes":"24","written_rows":"3","written_bytes":"24"} -{"query":"select * from opentelemetry_test format Null\n ","read_rows":"3","read_bytes":"24","written_rows":"","written_bytes":""} +{"query":"insert into opentelemetry_test values","read_rows":"3","written_rows":"3"} +{"query":"select * from opentelemetry_test format Null\n ","read_rows":"3","written_rows":""} diff --git a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh index b270c809d77..98b571c5968 100755 --- a/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh +++ b/tests/queries/0_stateless/02421_simple_queries_for_opentelemetry.sh @@ -5,16 +5,16 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . 
"$CURDIR"/../shell_config.sh # This function takes 2 arguments: -# $1 - query -# $2 - trace id -function executeQuery() +# $1 - query id +# $2 - query +function execute_query() { - ${CLICKHOUSE_CLIENT} --database=${CLICKHOUSE_DATABASE} --opentelemetry_start_trace_probability=1 --query_id $1 -nq " + ${CLICKHOUSE_CLIENT} --opentelemetry_start_trace_probability=1 --query_id $1 -nq " ${2} " } -# For some quries, it's not able to know how many bytes/rows are read when tests are executed on CI, +# For some queries, it's not possible to know how many bytes/rows are read when tests are executed on CI, # so we only to check the db.statement only function check_query_span_query_only() { @@ -35,9 +35,7 @@ ${CLICKHOUSE_CLIENT} -nq " SYSTEM FLUSH LOGS; SELECT attribute['db.statement'] as query, attribute['clickhouse.read_rows'] as read_rows, - attribute['clickhouse.read_bytes'] as read_bytes, - attribute['clickhouse.written_rows'] as written_rows, - attribute['clickhouse.written_bytes'] as written_bytes + attribute['clickhouse.written_rows'] as written_rows FROM system.opentelemetry_span_log WHERE finish_date >= yesterday() AND operation_name = 'query' @@ -49,36 +47,36 @@ ${CLICKHOUSE_CLIENT} -nq " # # Set up # -${CLICKHOUSE_CLIENT} --database=${CLICKHOUSE_DATABASE} -nq " -DROP TABLE IF EXISTS opentelemetry_test; -CREATE TABLE opentelemetry_test (id UInt64) Engine=MergeTree Order By id; +${CLICKHOUSE_CLIENT} -nq " +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.opentelemetry_test; +CREATE TABLE ${CLICKHOUSE_DATABASE}.opentelemetry_test (id UInt64) Engine=MergeTree Order By id; " # test 1, a query that has special path in the code # Format Null is used to make sure no output is generated so that it won't pollute the reference file query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); -executeQuery $query_id 'show processlist format Null' +execute_query $query_id 'show processlist format Null' check_query_span_query_only "$query_id" # test 2, a normal show command query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); -executeQuery $query_id 'show databases format Null' +execute_query $query_id 'show databases format Null' check_query_span_query_only "$query_id" # test 3, a normal insert query on local table query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); -executeQuery $query_id 'insert into opentelemetry_test values(1)(2)(3)' +execute_query $query_id 'insert into opentelemetry_test values(1)(2)(3)' check_query_span "$query_id" # test 4, a normal select query query_id=$(${CLICKHOUSE_CLIENT} -q "select generateUUIDv4()"); -executeQuery $query_id 'select * from opentelemetry_test format Null' +execute_query $query_id 'select * from opentelemetry_test format Null' check_query_span $query_id # # Tear down # -${CLICKHOUSE_CLIENT} --database=${CLICKHOUSE_DATABASE} -q " -DROP TABLE IF EXISTS opentelemetry_test; +${CLICKHOUSE_CLIENT} -q " +DROP TABLE IF EXISTS ${CLICKHOUSE_DATABASE}.opentelemetry_test; " \ No newline at end of file From bc111b56805c41a7905d78ea540c610a0e478a0b Mon Sep 17 00:00:00 2001 From: young scott Date: Wed, 14 Sep 2022 09:15:40 +0000 Subject: [PATCH 69/87] fix issuse#41096 --- src/Interpreters/DDLWorker.h | 7 ++++++- src/Storages/System/StorageSystemDDLWorkerQueue.cpp | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 7ddcc80c02a..e3c1fa4c271 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -61,18 +61,23 @@ public: return host_fqdn_id; } + 
std::string getQueueDir() const + { + return queue_dir; + } + void startup(); virtual void shutdown(); bool isCurrentlyActive() const { return initialized && !stop_flag; } -protected: /// Returns cached ZooKeeper session (possibly expired). ZooKeeperPtr tryGetZooKeeper() const; /// If necessary, creates a new session and caches it. ZooKeeperPtr getAndSetZooKeeper(); +protected: /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks void scheduleTasks(bool reinitialized); diff --git a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp index 111ea343398..185a6be6a70 100644 --- a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp +++ b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp @@ -205,9 +205,9 @@ static void fillStatusColumns(MutableColumns & res_columns, size_t & col, void StorageSystemDDLWorkerQueue::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const { - zkutil::ZooKeeperPtr zookeeper = context->getZooKeeper(); - fs::path ddl_zookeeper_path = context->getConfigRef().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/"); - + auto& ddl_worker = context->getDDLWorker(); + fs::path ddl_zookeeper_path = ddl_worker.getQueueDir(); + zkutil::ZooKeeperPtr zookeeper = ddl_worker.getAndSetZooKeeper(); Strings ddl_task_paths = zookeeper->getChildren(ddl_zookeeper_path); GetResponseFutures ddl_task_futures; From ea31302071803a51201239d8039e13ba78954bfd Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 14 Sep 2022 11:30:06 +0200 Subject: [PATCH 70/87] Fix --- src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index f18debe8a8b..1853b5b4dd7 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -131,9 +131,6 @@ DiskObjectStorageMetadata::DiskObjectStorageMetadata( void DiskObjectStorageMetadata::addObject(const String & path, size_t size) { - if (!object_storage_root_path.empty() && path.starts_with(object_storage_root_path)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected relative path"); - total_size += size; storage_objects.emplace_back(path, size); } From 52db0e5c40cb3137cbdcb9f8c802d1fd60cdc96c Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 14 Sep 2022 11:52:23 +0200 Subject: [PATCH 71/87] Update DiskObjectStorageMetadata.cpp --- src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index 1853b5b4dd7..56cc20098ba 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -13,7 +13,6 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_FORMAT; - extern const int LOGICAL_ERROR; } void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) From bb5f9e578aba3ed02cca2e671af4a6cabdbc0d69 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 14 Sep 2022 12:17:24 +0200 Subject: [PATCH 72/87] Update storing-data.md --- docs/en/operations/storing-data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/storing-data.md 
b/docs/en/operations/storing-data.md index 546e3d7b7a6..194778400d3 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -166,7 +166,7 @@ Cache **configuration settings**: - `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. -- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`. +- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `false`. This setting was added in version 22.8. If you used filesystem cache before this version, then it will not work on versions starting from 22.8 if this setting is set to `true`. If you want to use this setting, clear old cache created before version 22.8 before upgrading. - `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 Mb). From cc639f2e2db0ddaeb7cb396abdae86c13b0a8fb2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 14 Sep 2022 13:13:02 +0200 Subject: [PATCH 73/87] Update CachedOnDiskReadBufferFromFile.cpp --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index fa4a79415ec..d97b669ab10 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -206,7 +206,7 @@ CachedOnDiskReadBufferFromFile::getRemoteFSReadBuffer(FileSegment & file_segment return remote_file_reader; auto remote_fs_segment_reader = file_segment.extractRemoteFileReader(); - if (remote_fs_segment_reader && file_offset_of_buffer_end == remote_file_reader->getFileOffsetOfBufferEnd()) + if (remote_fs_segment_reader && file_offset_of_buffer_end == remote_fs_segment_reader->getFileOffsetOfBufferEnd()) remote_file_reader = remote_fs_segment_reader; else remote_file_reader = implementation_buffer_creator(); From 59e7eb084c57be7416d7bc18bf420ccc117580da Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 14 Sep 2022 11:15:10 +0000 Subject: [PATCH 74/87] Add column type check before UUID insertion in MsgPack format --- src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp | 4 +++- .../0_stateless/02422_msgpack_uuid_wrong_column.reference | 0 tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.reference create mode 100644 tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index b3d237fecfd..c9978de3ab2 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -235,8 +235,10 @@ static void insertNull(IColumn & column, DataTypePtr type) assert_cast(column).insertDefault(); } -static void insertUUID(IColumn & column, DataTypePtr /*type*/, const char * value, size_t size) +static void insertUUID(IColumn & column, DataTypePtr type, const char * value, size_t size) { + if (!isUUID(type)) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert MessagePack UUID into column with type {}.", type->getName()); ReadBufferFromMemory 
buf(value, size); UUID uuid; readBinaryBigEndian(uuid.toUnderType().items[0], buf); diff --git a/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.reference b/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql b/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql new file mode 100644 index 00000000000..4d790354d51 --- /dev/null +++ b/tests/queries/0_stateless/02422_msgpack_uuid_wrong_column.sql @@ -0,0 +1,4 @@ +-- Tags: no-parallel, no-fasttest + +insert into function file(02422_data.msgpack) select toUUID('f4cdd80d-5d15-4bdc-9527-adcca635ec1f') as uuid settings output_format_msgpack_uuid_representation='ext'; +select * from file(02422_data.msgpack, auto, 'x Int32'); -- {serverError ILLEGAL_COLUMN} From defd393fdc888ea53f6907e4cc5ac5289ccee24a Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 14 Sep 2022 14:07:07 +0200 Subject: [PATCH 75/87] Update src/DataTypes/ObjectUtils.h Co-authored-by: Antonio Andelic --- src/DataTypes/ObjectUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index f5ab1813e00..c60d5bec208 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -171,7 +171,7 @@ ColumnsDescription getObjectColumns( std::unordered_map types_in_entries; /// Add dummy column for all Object columns - /// to not to lost any column if it's missing + /// to not lose any column if it's missing /// in all entries. If it exists in any entry /// dummy column will be removed. for (const auto & column : storage_columns) From 088530b3a20e53030964ff3368a87aba52992482 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 14 Sep 2022 17:53:15 +0300 Subject: [PATCH 76/87] Update README.md --- utils/changelog/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/changelog/README.md b/utils/changelog/README.md index 7ecc727d248..739229b49c9 100644 --- a/utils/changelog/README.md +++ b/utils/changelog/README.md @@ -13,7 +13,8 @@ python3 changelog.py -h Usage example: ``` -git fetch --all # changelog.py depends on having the tags available, this will fetch them +git fetch --tags # changelog.py depends on having the tags available, this will fetch them + python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$GITHUB_TOKEN" v21.6.2.7-prestable python3 changelog.py --output=changelog-v22.4.1.2305-prestable.md --gh-user-or-token="$USER" --gh-password="$PASSWORD" v21.6.2.7-prestable ``` From cbdbe1077ad6c70cfd41a6071bfe85917285ad1d Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 14 Sep 2022 12:16:32 -0300 Subject: [PATCH 77/87] Update date-time-functions.md --- docs/en/sql-reference/functions/date-time-functions.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index ced96078ce1..c7341cfc13b 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1223,10 +1223,12 @@ Result: └───────────────────────┘ ``` -## FROM\_UNIXTIME +## FROM_UNIXTIME Function converts Unix timestamp to a calendar date and a time of a day. 
When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. +Alias: `fromUnixTimestamp`. + **Example:** Query: From c2b02c2ae95d697b63324d9ebe423d903aeb3032 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 14 Sep 2022 18:23:37 +0200 Subject: [PATCH 78/87] Download ccache from release PRs for backports --- tests/ci/build_check.py | 4 +++- tests/ci/ccache_utils.py | 10 +++++++++- tests/ci/fast_test_check.py | 2 +- tests/ci/pr_info.py | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index f58c7a74dfe..d668dbe0498 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -291,7 +291,9 @@ def main(): logging.info("Will try to fetch cache for our build") try: - get_ccache_if_not_exists(ccache_path, s3_helper, pr_info.number, TEMP_PATH) + get_ccache_if_not_exists( + ccache_path, s3_helper, pr_info.number, TEMP_PATH, pr_info.release_pr + ) except Exception as e: # In case there are issues with ccache, remove the path and do not fail a build logging.info("Failed to get ccache, building without it. Error: %s", e) diff --git a/tests/ci/ccache_utils.py b/tests/ci/ccache_utils.py index cfe07363589..864b3a8f9b6 100644 --- a/tests/ci/ccache_utils.py +++ b/tests/ci/ccache_utils.py @@ -11,6 +11,7 @@ import requests # type: ignore from compress_files import decompress_fast, compress_fast from env_helper import S3_DOWNLOAD, S3_BUILDS_BUCKET +from s3_helper import S3Helper DOWNLOAD_RETRIES_COUNT = 5 @@ -57,12 +58,19 @@ def dowload_file_with_progress(url, path): def get_ccache_if_not_exists( - path_to_ccache_dir, s3_helper, current_pr_number, temp_path + path_to_ccache_dir: str, + s3_helper: S3Helper, + current_pr_number: int, + temp_path: str, + release_pr: int, ) -> int: """returns: number of PR for downloaded PR. 
-1 if ccache not found""" ccache_name = os.path.basename(path_to_ccache_dir) cache_found = False prs_to_check = [current_pr_number] + # Release PR is either 0 or defined + if release_pr: + prs_to_check.append(release_pr) ccache_pr = -1 if current_pr_number != 0: prs_to_check.append(0) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 038289406de..03e42726808 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -125,7 +125,7 @@ if __name__ == "__main__": logging.info("Will try to fetch cache for our build") ccache_for_pr = get_ccache_if_not_exists( - cache_path, s3_helper, pr_info.number, temp_path + cache_path, s3_helper, pr_info.number, temp_path, pr_info.release_pr ) upload_master_ccache = ccache_for_pr in (-1, 0) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index 2acd0e4c811..77421ddac32 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -86,7 +86,7 @@ class PRInfo: self.changed_files = set() # type: Set[str] self.body = "" self.diff_urls = [] - self.release_pr = "" + self.release_pr = 0 ref = github_event.get("ref", "refs/head/master") if ref and ref.startswith("refs/heads/"): ref = ref[11:] From 173df9a73a969e46d3d181cf5650b0686a353631 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 14 Sep 2022 20:38:50 +0300 Subject: [PATCH 79/87] Update StorageSystemDDLWorkerQueue.cpp --- src/Storages/System/StorageSystemDDLWorkerQueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp index 185a6be6a70..67867b6c577 100644 --- a/src/Storages/System/StorageSystemDDLWorkerQueue.cpp +++ b/src/Storages/System/StorageSystemDDLWorkerQueue.cpp @@ -206,7 +206,7 @@ static void fillStatusColumns(MutableColumns & res_columns, size_t & col, void StorageSystemDDLWorkerQueue::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const { auto& ddl_worker = context->getDDLWorker(); - fs::path ddl_zookeeper_path = ddl_worker.getQueueDir(); + fs::path ddl_zookeeper_path = ddl_worker.getQueueDir(); zkutil::ZooKeeperPtr zookeeper = ddl_worker.getAndSetZooKeeper(); Strings ddl_task_paths = zookeeper->getChildren(ddl_zookeeper_path); From 69d90de9a3739ae8eda4082b4c2fae1af0c88ea1 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 14 Sep 2022 15:23:59 -0300 Subject: [PATCH 80/87] Update date-time-functions.md --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index c7341cfc13b..8688f3eb3a0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1223,7 +1223,7 @@ Result: └───────────────────────┘ ``` -## FROM_UNIXTIME +## FROM\_UNIXTIME Function converts Unix timestamp to a calendar date and a time of a day. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. 
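
A quick illustration of the single-argument behaviour described in the documentation hunk above (a sketch only: the constant and the shown result are illustrative, and the rendered value shifts with the session time zone):

```sql
-- Single-argument FROM_UNIXTIME behaves like toDateTime; fromUnixTimestamp is its alias.
SELECT FROM_UNIXTIME(423543535) AS dt, fromUnixTimestamp(423543535) AS dt_alias;
-- Expected in a UTC session: 1983-06-04 10:58:55    1983-06-04 10:58:55
```
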
From 559c696230bb64e5dee247cd7aa109f2f4621aaf Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 14 Sep 2022 20:29:36 +0200 Subject: [PATCH 81/87] Fix --- .../IO/CachedOnDiskReadBufferFromFile.cpp | 16 ++-- src/Disks/IO/CachedOnDiskReadBufferFromFile.h | 2 +- .../configs/config.d/storage_conf.xml | 21 +++++ tests/integration/test_merge_tree_s3/test.py | 78 ++++++++++++++++++- 4 files changed, 107 insertions(+), 10 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index fa4a79415ec..e268faebc63 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -143,9 +143,9 @@ void CachedOnDiskReadBufferFromFile::initialize(size_t offset, size_t size) } CachedOnDiskReadBufferFromFile::ImplementationBufferPtr -CachedOnDiskReadBufferFromFile::getCacheReadBuffer(size_t offset) const +CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segment) const { - auto path = cache->getPathInLocalCache(cache_key, offset, is_persistent); + auto path = file_segment.getPathInLocalCache(); ReadSettings local_read_settings{settings}; /// Do not allow to use asynchronous version of LocalFSReadMethod. @@ -247,7 +247,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil if (download_state == FileSegment::State::DOWNLOADED) { read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } else { @@ -280,7 +280,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil /// file_offset_of_buffer_end read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } download_state = file_segment->wait(); @@ -289,7 +289,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil case FileSegment::State::DOWNLOADED: { read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } case FileSegment::State::EMPTY: case FileSegment::State::PARTIALLY_DOWNLOADED: @@ -305,7 +305,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil /// file_offset_of_buffer_end read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } auto downloader_id = file_segment->getOrSetDownloader(); @@ -323,7 +323,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil read_type = ReadType::CACHED; file_segment->resetDownloader(); - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } if (file_segment->getCurrentWriteOffset() < file_offset_of_buffer_end) @@ -354,7 +354,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil if (canStartFromCache(file_offset_of_buffer_end, *file_segment)) { read_type = ReadType::CACHED; - return getCacheReadBuffer(range.left); + return getCacheReadBuffer(*file_segment); } else { diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h index b86e53ec160..535d01f3a8c 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.h +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.h @@ -68,7 +68,7 @@ private: ImplementationBufferPtr getReadBufferForFileSegment(FileSegmentPtr & file_segment); - ImplementationBufferPtr getCacheReadBuffer(size_t offset) const; + ImplementationBufferPtr getCacheReadBuffer(const 
FileSegment & file_segment) const; std::optional getLastNonDownloadedOffset() const; diff --git a/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml b/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml index 3ee49744a61..f3505f53339 100644 --- a/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml +++ b/tests/integration/test_merge_tree_s3/configs/config.d/storage_conf.xml @@ -38,6 +38,20 @@ /jbod1/ 1000000000 + + s3 + http://minio1:9001/root/data/ + minio + minio123 + 33554432 + + + cache + s3_r + /s3_cache_r/ + 1000000000 + 1 + @@ -78,6 +92,13 @@ + + +
+ s3_cache_r +
+
+
diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 9b254b71826..4ce5fd5a069 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -6,7 +6,6 @@ import pytest from helpers.cluster import ClickHouseCluster from helpers.utility import generate_values, replace_config, SafeThread - SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) @@ -36,6 +35,7 @@ def cluster(): "/jbod1:size=2M", ], ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -742,3 +742,79 @@ def test_store_cleanup_disk_s3(cluster, node_name): "CREATE TABLE s3_test UUID '00000000-1000-4000-8000-000000000001' (n UInt64) Engine=MergeTree() ORDER BY n SETTINGS storage_policy='s3';" ) node.query("INSERT INTO s3_test SELECT 1") + + +@pytest.mark.parametrize("node_name", ["node"]) +def test_cache_setting_compatibility(cluster, node_name): + node = cluster.instances[node_name] + + node.query("DROP TABLE IF EXISTS s3_test NO DELAY") + + node.query( + "CREATE TABLE s3_test (key UInt32, value String) Engine=MergeTree() ORDER BY key SETTINGS storage_policy='s3_cache_r';" + ) + node.query( + "INSERT INTO s3_test SELECT * FROM generateRandom('key UInt32, value String') LIMIT 500" + ) + + result = node.query("SYSTEM DROP FILESYSTEM CACHE") + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) == 0 + + node.query("SELECT * FROM s3_test") + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) > 0 + + config_path = os.path.join( + SCRIPT_DIR, + f"./{cluster.instances_dir_name}/node/configs/config.d/storage_conf.xml", + ) + + replace_config( + config_path, + "1", + "0", + ) + + result = node.query("DESCRIBE CACHE 's3_cache_r'") + assert result.strip().endswith("1") + + node.restart_clickhouse() + + result = node.query("DESCRIBE CACHE 's3_cache_r'") + assert result.strip().endswith("0") + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) > 0 + + node.query("SELECT * FROM s3_test FORMAT Null") + + assert not node.contains_in_log("No such file or directory: Cache info:") + + replace_config( + config_path, + "0", + "1", + ) + + result = node.query( + "SELECT count() FROM system.filesystem_cache WHERE cache_path LIKE '%persistent'" + ) + assert int(result) > 0 + + node.restart_clickhouse() + + result = node.query("DESCRIBE CACHE 's3_cache_r'") + assert result.strip().endswith("1") + + node.query("SELECT * FROM s3_test FORMAT Null") + + assert not node.contains_in_log("No such file or directory: Cache info:") From 0b1c2c62fd4cb2f7999589308a5d3fcd37ac5551 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 14 Sep 2022 20:59:45 +0200 Subject: [PATCH 82/87] Update CachedOnDiskReadBufferFromFile.cpp --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index e268faebc63..5e7f107144f 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -145,6 +145,8 @@ void CachedOnDiskReadBufferFromFile::initialize(size_t offset, size_t size) CachedOnDiskReadBufferFromFile::ImplementationBufferPtr 
CachedOnDiskReadBufferFromFile::getCacheReadBuffer(const FileSegment & file_segment) const { + /// Use is_persistent flag from in-memory state of the filesegment, + /// because it is consistent with what is written on disk. auto path = file_segment.getPathInLocalCache(); ReadSettings local_read_settings{settings}; @@ -237,8 +239,6 @@ bool CachedOnDiskReadBufferFromFile::canStartFromCache(size_t current_offset, co CachedOnDiskReadBufferFromFile::ImplementationBufferPtr CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & file_segment) { - auto range = file_segment->range(); - auto download_state = file_segment->state(); LOG_TEST(log, "getReadBufferForFileSegment: {}", file_segment->getInfoForLog()); From 780c0e9b2f011d9d91a265500ec7d68947d9c577 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 15 Sep 2022 00:18:23 +0200 Subject: [PATCH 83/87] Update CachedOnDiskReadBufferFromFile.cpp --- src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 5e7f107144f..d370a2902a4 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -339,7 +339,7 @@ CachedOnDiskReadBufferFromFile::getReadBufferForFileSegment(FileSegmentPtr & fil LOG_TEST(log, "Predownload. File segment info: {}", file_segment->getInfoForLog()); chassert(file_offset_of_buffer_end > file_segment->getCurrentWriteOffset()); bytes_to_predownload = file_offset_of_buffer_end - file_segment->getCurrentWriteOffset(); - chassert(bytes_to_predownload < range.size()); + chassert(bytes_to_predownload < file_segment->range().size()); } read_type = ReadType::REMOTE_FS_READ_AND_PUT_IN_CACHE; From def4eeac70d98859b6f7f96423148192bfe197ed Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 15 Sep 2022 12:27:08 +0200 Subject: [PATCH 84/87] Fix perf tests (#41332) We run left server two times. If after the first run server will not be properly stopped, we will get `Address already in use: [::]:9001` exception on the second run. --- docker/test/performance-comparison/compare.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index d3d7084f37f..b0b5ebdb2e2 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -61,7 +61,7 @@ function configure cp -rv right/config left ||: # Start a temporary server to rename the tables - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo all killed set -m # Spawn temporary in its own process groups @@ -88,7 +88,7 @@ function configure clickhouse-client --port $LEFT_SERVER_PORT --query "create database test" ||: clickhouse-client --port $LEFT_SERVER_PORT --query "rename table datasets.hits_v1 to test.hits" ||: - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo all killed # Make copies of the original db for both servers. Use hardlinks instead @@ -106,7 +106,7 @@ function configure function restart { - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo all killed # Change the jemalloc settings here. 
@@ -1400,7 +1400,7 @@ case "$stage" in while env kill -- -$watchdog_pid ; do sleep 1; done # Stop the servers to free memory for the subsequent query analysis. - while pkill clickhouse-serv; do echo . ; sleep 1 ; done + while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done echo Servers stopped. ;& "analyze_queries") From 4935a4bf966b724e0750e677d4a73ed23dfd23f4 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 15 Sep 2022 12:29:22 +0200 Subject: [PATCH 85/87] Update storing-data.md --- docs/en/operations/storing-data.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 194778400d3..6c8901e66c9 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -108,6 +108,7 @@ Example of disk configuration: 1 + ``` @@ -134,6 +135,13 @@ Example of configuration for versions later or equal to 22.8: 10000000 + + +
+ cache +
+
+ ``` @@ -151,6 +159,13 @@ Example of configuration for versions earlier than 22.8: 10000000 + + +
+ s3 +
+
+ ``` From 1ec7ce265becb119740b17ea0758d5133e923265 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 15 Sep 2022 12:30:02 +0200 Subject: [PATCH 86/87] Update storing-data.md --- docs/en/operations/storing-data.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 6c8901e66c9..663469ef4ae 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -108,7 +108,6 @@ Example of disk configuration: 1 - ``` From 1b8b2ebed532a2d459f2a80c7bd9d3340de2bbaa Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 15 Sep 2022 08:41:46 -0300 Subject: [PATCH 87/87] Disable optimize_monotonous_functions_in_order_by by default (#41136) * #40094 disable optimize_monotonous_functions_in_order_by by default * fix 01576_alias_column_rewrite test * fix incorrect 02149_read_in_order_fixed_prefix.sql test * Update 02149_read_in_order_fixed_prefix.sql --- src/Core/Settings.h | 2 +- ...1321_monotonous_functions_in_order_by_bug.reference | 2 ++ .../01321_monotonous_functions_in_order_by_bug.sql | 7 +++++++ .../queries/0_stateless/01576_alias_column_rewrite.sql | 2 +- .../0_stateless/02149_read_in_order_fixed_prefix.sql | 10 ++++++++-- 5 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference create mode 100644 tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index da420079766..8793bbb3011 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -481,7 +481,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ M(Bool, optimize_multiif_to_if, true, "Replace 'multiIf' with only one condition to 'if'.", 0) \ M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail.", 0) \ - M(Bool, optimize_monotonous_functions_in_order_by, true, "Replace monotonous function with its argument in ORDER BY", 0) \ + M(Bool, optimize_monotonous_functions_in_order_by, false, "Replace monotonous function with its argument in ORDER BY", 0) \ M(Bool, optimize_functions_to_subcolumns, false, "Transform functions to subcolumns, if possible, to reduce amount of read data. E.g. 
'length(arr)' -> 'arr.size0', 'col IS NULL' -> 'col.null' ", 0) \ M(Bool, optimize_using_constraints, false, "Use constraints for query optimization", 0) \ M(Bool, optimize_substitute_columns, false, "Use constraints for column substitution", 0) \ diff --git a/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference new file mode 100644 index 00000000000..0c720206065 --- /dev/null +++ b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.reference @@ -0,0 +1,2 @@ +2020-01-01 01:00:00 1 +2020-01-01 01:00:00 999 diff --git a/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql new file mode 100644 index 00000000000..4aa52fe6ae8 --- /dev/null +++ b/tests/queries/0_stateless/01321_monotonous_functions_in_order_by_bug.sql @@ -0,0 +1,7 @@ +SELECT + toStartOfHour(c1) AS _c1, + c2 +FROM values((toDateTime('2020-01-01 01:01:01'), 999), (toDateTime('2020-01-01 01:01:59'), 1)) +ORDER BY + _c1 ASC, + c2 ASC diff --git a/tests/queries/0_stateless/01576_alias_column_rewrite.sql b/tests/queries/0_stateless/01576_alias_column_rewrite.sql index 8424eb11f9b..1f28225bef8 100644 --- a/tests/queries/0_stateless/01576_alias_column_rewrite.sql +++ b/tests/queries/0_stateless/01576_alias_column_rewrite.sql @@ -17,7 +17,7 @@ INSERT INTO test_table(timestamp, value) SELECT toDateTime('2020-01-01 12:00:00' INSERT INTO test_table(timestamp, value) SELECT toDateTime('2020-01-02 12:00:00'), 1 FROM numbers(10); INSERT INTO test_table(timestamp, value) SELECT toDateTime('2020-01-03 12:00:00'), 1 FROM numbers(10); -set optimize_respect_aliases = 1; +set optimize_respect_aliases = 1, optimize_monotonous_functions_in_order_by = 1; SELECT 'test-partition-prune'; SELECT COUNT() = 10 FROM test_table WHERE day = '2020-01-01' SETTINGS max_rows_to_read = 10; diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql index 4dfcbb9bf80..44c1c12be35 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.sql @@ -56,7 +56,13 @@ ENGINE = MergeTree ORDER BY (toStartOfDay(dt), d); INSERT INTO t_read_in_order SELECT toDateTime('2020-10-10 00:00:00') + number, 1 / (number % 100 + 1), number FROM numbers(1000); EXPLAIN PIPELINE SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 5; -SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 5; +SELECT * from ( + SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 50000000000 + -- subquery with limit 50000000 to stabilize a test result and prevent order by d pushdown +) order by d limit 5; EXPLAIN PIPELINE SELECT toStartOfDay(dt) as date, d FROM t_read_in_order ORDER BY date, round(d) LIMIT 5; -SELECT toStartOfDay(dt) as date, d FROM t_read_in_order WHERE date = '2020-10-10' ORDER BY round(d) LIMIT 5; +SELECT * from ( + SELECT toStartOfDay(dt) as date, d FROM t_read_in_order WHERE date = '2020-10-10' ORDER BY round(d) LIMIT 50000000000 + -- subquery with limit 50000000 to stabilize a test result and prevent order by d pushdown +) order by d limit 5;
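
The last patch only flips the default; the rewrite of a monotonous function in ORDER BY can still be requested per query when the old behaviour is wanted. A minimal sketch, assuming a hypothetical table `example_table` with columns `dt` (DateTime) and `value`:

```sql
-- Hypothetical names; the SETTINGS clause re-enables the rewrite for this query only.
SELECT toStartOfHour(dt) AS hour_bucket, value
FROM example_table
ORDER BY hour_bucket, value
SETTINGS optimize_monotonous_functions_in_order_by = 1;
```

As the new 01321 test above shows, with the rewrite enabled the ORDER BY key degrades to the raw argument, so rows that share the same `toStartOfHour` bucket may come back in a different order; that is why the default is now `false`.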