From 6cd6319ba70945e7ae50447772c57d61e488e72e Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Jul 2024 15:32:28 +0000 Subject: [PATCH 01/56] Properly convert boolean literals in query tree --- src/Interpreters/convertFieldToType.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 184c263dbdb..d87d4a73e37 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -214,6 +214,10 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID } else if (type.isValueRepresentedByNumber() && src.getType() != Field::Types::String) { + /// Bool is not represented in which_type, so we need to type it separately + if (isInt64OrUInt64orBoolFieldType(src.getType()) && type.getName() == "Bool") + return bool(src.safeGet()); + if (which_type.isUInt8()) return convertNumericType(src, type); if (which_type.isUInt16()) return convertNumericType(src, type); if (which_type.isUInt32()) return convertNumericType(src, type); From 294eaaeabd60140d4f4f6ae892b015d6fe4f551c Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Jul 2024 15:43:06 +0000 Subject: [PATCH 02/56] Test bool in gtest_transform_query_for_external_database --- .../gtest_transform_query_for_external_database.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index 6765e112bb9..5a63c118e2d 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -74,6 +75,7 @@ private: {"a", std::make_shared()}, {"b", std::make_shared()}, {"foo", std::make_shared()}, + {"is_value", DataTypeFactory::instance().get("Bool")}, }), TableWithColumnNamesAndTypes( createDBAndTable("table2"), @@ -411,6 +413,14 @@ TEST(TransformQueryForExternalDatabase, Analyzer) R"(SELECT "column" FROM "test"."table")"); check(state, 1, {"column", "apply_id", "apply_type", "apply_status", "create_time", "field", "value", "a", "b", "foo"}, - "SELECT * FROM table WHERE (column) IN (1)", + "SELECT * EXCEPT (is_value) FROM table WHERE (column) IN (1)", R"(SELECT "column", "apply_id", "apply_type", "apply_status", "create_time", "field", "value", "a", "b", "foo" FROM "test"."table" WHERE "column" IN (1))"); + + check(state, 1, {"is_value"}, + "SELECT is_value FROM table WHERE is_value = true", + R"(SELECT "is_value" FROM "test"."table" WHERE "is_value" = true)"); + + check(state, 1, {"is_value"}, + "SELECT is_value FROM table WHERE is_value = 1", + R"(SELECT "is_value" FROM "test"."table" WHERE "is_value" = 1)"); } From 46fbc23e093a85f431ac8afdc33f52fe267506c0 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Jul 2024 17:11:21 +0000 Subject: [PATCH 03/56] update 02952_conjunction_optimization.reference --- .../0_stateless/02952_conjunction_optimization.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02952_conjunction_optimization.reference b/tests/queries/0_stateless/02952_conjunction_optimization.reference index eeadfaae21d..8af0abefd3a 100644 --- a/tests/queries/0_stateless/02952_conjunction_optimization.reference +++ b/tests/queries/0_stateless/02952_conjunction_optimization.reference @@ -32,7 +32,7 @@ QUERY id: 0 FUNCTION id: 5, 
function_name: and, function_type: ordinary, result_type: Bool ARGUMENTS LIST id: 6, nodes: 2 - CONSTANT id: 7, constant_value: UInt64_1, constant_value_type: Bool + CONSTANT id: 7, constant_value: Bool_1, constant_value_type: Bool FUNCTION id: 8, function_name: notIn, function_type: ordinary, result_type: UInt8 ARGUMENTS LIST id: 9, nodes: 2 From b8944abe0ec73ca386e851ce96d5f7ddaf3d254e Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Wed, 10 Jul 2024 19:01:52 +0800 Subject: [PATCH 04/56] refactor: avoid unneed calculation in SeriesPeriodDetect Signed-off-by: Ruihang Xia --- src/Functions/seriesPeriodDetectFFT.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Functions/seriesPeriodDetectFFT.cpp b/src/Functions/seriesPeriodDetectFFT.cpp index 471354235d5..ecf8398bbd5 100644 --- a/src/Functions/seriesPeriodDetectFFT.cpp +++ b/src/Functions/seriesPeriodDetectFFT.cpp @@ -153,12 +153,8 @@ public: return true; } - std::vector xfreq(spec_len); double step = 0.5 / (spec_len - 1); - for (size_t i = 0; i < spec_len; ++i) - xfreq[i] = i * step; - - auto freq = xfreq[idx]; + auto freq = idx * step; period = std::round(1 / freq); return true; From 3cbb3dc55f6582bc8abc7d5683080702080adcd8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 26 Jul 2024 08:41:44 +0000 Subject: [PATCH 05/56] Do not spam logs with messages related to connection reset by peer --- src/Server/HTTP/HTTPServerConnection.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp index 047db014560..8eb2ecb1224 100644 --- a/src/Server/HTTP/HTTPServerConnection.cpp +++ b/src/Server/HTTP/HTTPServerConnection.cpp @@ -97,6 +97,18 @@ void HTTPServerConnection::run() { sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_BAD_REQUEST); } + catch (const Poco::Net::NetException & e) + { + /// Do not spam logs with messages related to connection reset by peer. + if (e.code() == POCO_ENOTCONN) + break; + + if (session.networkException()) + session.networkException()->rethrow(); + else + throw; + } + catch (const Poco::Exception &) { if (session.networkException()) From 73e71e7b7a476c7073154a112dd12815bbe49bfe Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 26 Jul 2024 08:45:43 +0000 Subject: [PATCH 06/56] log --- src/Server/HTTP/HTTPServerConnection.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp index 8eb2ecb1224..39e066005b9 100644 --- a/src/Server/HTTP/HTTPServerConnection.cpp +++ b/src/Server/HTTP/HTTPServerConnection.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -101,7 +102,10 @@ void HTTPServerConnection::run() { /// Do not spam logs with messages related to connection reset by peer. 
if (e.code() == POCO_ENOTCONN) + { + LOG_DEBUG(LogFrequencyLimiter(getLogger("HTTPServerConnection"), 10), "Connection reset by peer while processing HTTP request: {}", e.message()); break; + } if (session.networkException()) session.networkException()->rethrow(); From 0ec292a65f190f69c24420d5ca85d5658bffba0a Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 2 Aug 2024 00:32:01 -0400 Subject: [PATCH 07/56] Revert "Revert "FuzzQuery table function"" This reverts commit ff44b206 --- .../table-functions/fuzzQuery.md | 36 ++++ programs/client/Client.h | 5 +- src/{Client => Common}/QueryFuzzer.cpp | 50 ++++-- src/{Client => Common}/QueryFuzzer.h | 35 ++-- src/Storages/StorageFuzzQuery.cpp | 169 ++++++++++++++++++ src/Storages/StorageFuzzQuery.h | 88 +++++++++ src/Storages/registerStorages.cpp | 2 + src/TableFunctions/TableFunctionFuzzQuery.cpp | 54 ++++++ src/TableFunctions/TableFunctionFuzzQuery.h | 42 +++++ src/TableFunctions/registerTableFunctions.cpp | 1 + src/TableFunctions/registerTableFunctions.h | 1 + .../03031_table_function_fuzzquery.reference | 2 + .../03031_table_function_fuzzquery.sql | 18 ++ 13 files changed, 473 insertions(+), 30 deletions(-) create mode 100644 docs/en/sql-reference/table-functions/fuzzQuery.md rename src/{Client => Common}/QueryFuzzer.cpp (97%) rename src/{Client => Common}/QueryFuzzer.h (91%) create mode 100644 src/Storages/StorageFuzzQuery.cpp create mode 100644 src/Storages/StorageFuzzQuery.h create mode 100644 src/TableFunctions/TableFunctionFuzzQuery.cpp create mode 100644 src/TableFunctions/TableFunctionFuzzQuery.h create mode 100644 tests/queries/0_stateless/03031_table_function_fuzzquery.reference create mode 100644 tests/queries/0_stateless/03031_table_function_fuzzquery.sql diff --git a/docs/en/sql-reference/table-functions/fuzzQuery.md b/docs/en/sql-reference/table-functions/fuzzQuery.md new file mode 100644 index 00000000000..e15f8a40156 --- /dev/null +++ b/docs/en/sql-reference/table-functions/fuzzQuery.md @@ -0,0 +1,36 @@ +--- +slug: /en/sql-reference/table-functions/fuzzQuery +sidebar_position: 75 +sidebar_label: fuzzQuery +--- + +# fuzzQuery + +Perturbs the given query string with random variations. + +``` sql +fuzzQuery(query[, max_query_length[, random_seed]]) +``` + +**Arguments** + +- `query` (String) - The source query to perform the fuzzing on. +- `max_query_length` (UInt64) - A maximum length the query can get during the fuzzing process. +- `random_seed` (UInt64) - A random seed for producing stable results. + +**Returned Value** + +A table object with a single column containing perturbed query strings. + +## Usage Example + +``` sql +SELECT * FROM fuzzQuery('SELECT materialize(\'a\' AS key) GROUP BY key') LIMIT 2; +``` + +``` + ┌─query──────────────────────────────────────────────────────────┐ +1. │ SELECT 'a' AS key GROUP BY key │ +2. 
│ EXPLAIN PIPELINE compact = true SELECT 'a' AS key GROUP BY key │ + └────────────────────────────────────────────────────────────────┘ +``` diff --git a/programs/client/Client.h b/programs/client/Client.h index 7fdf77031ab..07a8e293b1a 100644 --- a/programs/client/Client.h +++ b/programs/client/Client.h @@ -11,7 +11,10 @@ class Client : public ClientApplicationBase public: using Arguments = ClientApplicationBase::Arguments; - Client() = default; + Client() + { + fuzzer = QueryFuzzer(randomSeed(), &std::cout, &std::cerr); + } void initialize(Poco::Util::Application & self) override; diff --git a/src/Client/QueryFuzzer.cpp b/src/Common/QueryFuzzer.cpp similarity index 97% rename from src/Client/QueryFuzzer.cpp rename to src/Common/QueryFuzzer.cpp index f5b700ea529..161c38f20e0 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Common/QueryFuzzer.cpp @@ -68,22 +68,21 @@ Field QueryFuzzer::getRandomField(int type) { case 0: { - return bad_int64_values[fuzz_rand() % (sizeof(bad_int64_values) - / sizeof(*bad_int64_values))]; + return bad_int64_values[fuzz_rand() % std::size(bad_int64_values)]; } case 1: { static constexpr double values[] = {NAN, INFINITY, -INFINITY, 0., -0., 0.0001, 0.5, 0.9999, 1., 1.0001, 2., 10.0001, 100.0001, 1000.0001, 1e10, 1e20, - FLT_MIN, FLT_MIN + FLT_EPSILON, FLT_MAX, FLT_MAX + FLT_EPSILON}; return values[fuzz_rand() % (sizeof(values) / sizeof(*values))]; + FLT_MIN, FLT_MIN + FLT_EPSILON, FLT_MAX, FLT_MAX + FLT_EPSILON}; return values[fuzz_rand() % std::size(values)]; } case 2: { static constexpr UInt64 scales[] = {0, 1, 2, 10}; return DecimalField( - bad_int64_values[fuzz_rand() % (sizeof(bad_int64_values) / sizeof(*bad_int64_values))], - static_cast(scales[fuzz_rand() % (sizeof(scales) / sizeof(*scales))]) + bad_int64_values[fuzz_rand() % std::size(bad_int64_values)], + static_cast(scales[fuzz_rand() % std::size(scales)]) ); } default: @@ -165,7 +164,8 @@ Field QueryFuzzer::fuzzField(Field field) { size_t pos = fuzz_rand() % arr.size(); arr.erase(arr.begin() + pos); - std::cerr << "erased\n"; + if (debug_stream) + *debug_stream << "erased\n"; } if (fuzz_rand() % 5 == 0) @@ -174,12 +174,14 @@ Field QueryFuzzer::fuzzField(Field field) { size_t pos = fuzz_rand() % arr.size(); arr.insert(arr.begin() + pos, fuzzField(arr[pos])); - std::cerr << fmt::format("inserted (pos {})\n", pos); + if (debug_stream) + *debug_stream << fmt::format("inserted (pos {})\n", pos); } else { arr.insert(arr.begin(), getRandomField(0)); - std::cerr << "inserted (0)\n"; + if (debug_stream) + *debug_stream << "inserted (0)\n"; } } @@ -197,7 +199,9 @@ Field QueryFuzzer::fuzzField(Field field) { size_t pos = fuzz_rand() % arr.size(); arr.erase(arr.begin() + pos); - std::cerr << "erased\n"; + + if (debug_stream) + *debug_stream << "erased\n"; } if (fuzz_rand() % 5 == 0) @@ -206,12 +210,16 @@ Field QueryFuzzer::fuzzField(Field field) { size_t pos = fuzz_rand() % arr.size(); arr.insert(arr.begin() + pos, fuzzField(arr[pos])); - std::cerr << fmt::format("inserted (pos {})\n", pos); + + if (debug_stream) + *debug_stream << fmt::format("inserted (pos {})\n", pos); } else { arr.insert(arr.begin(), getRandomField(0)); - std::cerr << "inserted (0)\n"; + + if (debug_stream) + *debug_stream << "inserted (0)\n"; } } @@ -344,7 +352,8 @@ void QueryFuzzer::fuzzOrderByList(IAST * ast) } else { - std::cerr << "No random column.\n"; + if (debug_stream) + *debug_stream << "No random column.\n"; } } @@ -378,7 +387,8 @@ void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast) if (col) impl->children.insert(pos, col); 
else - std::cerr << "No random column.\n"; + if (debug_stream) + *debug_stream << "No random column.\n"; } // We don't have to recurse here to fuzz the children, this is handled by @@ -1361,11 +1371,15 @@ void QueryFuzzer::fuzzMain(ASTPtr & ast) collectFuzzInfoMain(ast); fuzz(ast); - std::cout << std::endl; - WriteBufferFromOStream ast_buf(std::cout, 4096); - formatAST(*ast, ast_buf, false /*highlight*/); - ast_buf.finalize(); - std::cout << std::endl << std::endl; + if (out_stream) + { + *out_stream << std::endl; + + WriteBufferFromOStream ast_buf(*out_stream, 4096); + formatAST(*ast, ast_buf, false /*highlight*/); + ast_buf.finalize(); + *out_stream << std::endl << std::endl; + } } } diff --git a/src/Client/QueryFuzzer.h b/src/Common/QueryFuzzer.h similarity index 91% rename from src/Client/QueryFuzzer.h rename to src/Common/QueryFuzzer.h index 6165e589cae..35d088809f2 100644 --- a/src/Client/QueryFuzzer.h +++ b/src/Common/QueryFuzzer.h @@ -35,9 +35,31 @@ struct ASTWindowDefinition; * queries, so you want to feed it a lot of queries to get some interesting mix * of them. Normally we feed SQL regression tests to it. */ -struct QueryFuzzer +class QueryFuzzer { - pcg64 fuzz_rand{randomSeed()}; +public: + explicit QueryFuzzer(pcg64 fuzz_rand_ = randomSeed(), std::ostream * out_stream_ = nullptr, std::ostream * debug_stream_ = nullptr) + : fuzz_rand(fuzz_rand_) + , out_stream(out_stream_) + , debug_stream(debug_stream_) + { + } + + // This is the only function you have to call -- it will modify the passed + // ASTPtr to point to new AST with some random changes. + void fuzzMain(ASTPtr & ast); + + ASTs getInsertQueriesForFuzzedTables(const String & full_query); + ASTs getDropQueriesForFuzzedTables(const ASTDropQuery & drop_query); + void notifyQueryFailed(ASTPtr ast); + + static bool isSuitableForFuzzing(const ASTCreateQuery & create); + +private: + pcg64 fuzz_rand; + + std::ostream * out_stream = nullptr; + std::ostream * debug_stream = nullptr; // We add elements to expression lists with fixed probability. Some elements // are so large, that the expected number of elements we add to them is @@ -66,10 +88,6 @@ struct QueryFuzzer std::unordered_map index_of_fuzzed_table; std::set created_tables_hashes; - // This is the only function you have to call -- it will modify the passed - // ASTPtr to point to new AST with some random changes. - void fuzzMain(ASTPtr & ast); - // Various helper functions follow, normally you shouldn't have to call them. 
Field getRandomField(int type); Field fuzzField(Field field); @@ -77,9 +95,6 @@ struct QueryFuzzer ASTPtr getRandomExpressionList(); DataTypePtr fuzzDataType(DataTypePtr type); DataTypePtr getRandomType(); - ASTs getInsertQueriesForFuzzedTables(const String & full_query); - ASTs getDropQueriesForFuzzedTables(const ASTDropQuery & drop_query); - void notifyQueryFailed(ASTPtr ast); void replaceWithColumnLike(ASTPtr & ast); void replaceWithTableLike(ASTPtr & ast); void fuzzOrderByElement(ASTOrderByElement * elem); @@ -102,8 +117,6 @@ struct QueryFuzzer void addTableLike(ASTPtr ast); void addColumnLike(ASTPtr ast); void collectFuzzInfoRecurse(ASTPtr ast); - - static bool isSuitableForFuzzing(const ASTCreateQuery & create); }; } diff --git a/src/Storages/StorageFuzzQuery.cpp b/src/Storages/StorageFuzzQuery.cpp new file mode 100644 index 00000000000..6e8f425f8dc --- /dev/null +++ b/src/Storages/StorageFuzzQuery.cpp @@ -0,0 +1,169 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +ColumnPtr FuzzQuerySource::createColumn() +{ + auto column = ColumnString::create(); + ColumnString::Chars & data_to = column->getChars(); + ColumnString::Offsets & offsets_to = column->getOffsets(); + + offsets_to.resize(block_size); + IColumn::Offset offset = 0; + + auto fuzz_base = query; + size_t row_num = 0; + + while (row_num < block_size) + { + ASTPtr new_query = fuzz_base->clone(); + + auto base_before_fuzz = fuzz_base->formatForErrorMessage(); + fuzzer.fuzzMain(new_query); + auto fuzzed_text = new_query->formatForErrorMessage(); + + if (base_before_fuzz == fuzzed_text) + continue; + + /// AST is too long, will start from the original query. 
+ if (config.max_query_length > 500) + { + fuzz_base = query; + continue; + } + + IColumn::Offset next_offset = offset + fuzzed_text.size() + 1; + data_to.resize(next_offset); + + std::copy(fuzzed_text.begin(), fuzzed_text.end(), &data_to[offset]); + + data_to[offset + fuzzed_text.size()] = 0; + offsets_to[row_num] = next_offset; + + offset = next_offset; + fuzz_base = new_query; + ++row_num; + } + + return column; +} + +StorageFuzzQuery::StorageFuzzQuery( + const StorageID & table_id_, const ColumnsDescription & columns_, const String & comment_, const Configuration & config_) + : IStorage(table_id_), config(config_) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(columns_); + storage_metadata.setComment(comment_); + setInMemoryMetadata(storage_metadata); +} + +Pipe StorageFuzzQuery::read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & /*query_info*/, + ContextPtr /*context*/, + QueryProcessingStage::Enum /*processed_stage*/, + size_t max_block_size, + size_t num_streams) +{ + storage_snapshot->check(column_names); + + Pipes pipes; + pipes.reserve(num_streams); + + const ColumnsDescription & our_columns = storage_snapshot->metadata->getColumns(); + Block block_header; + for (const auto & name : column_names) + { + const auto & name_type = our_columns.get(name); + MutableColumnPtr column = name_type.type->createColumn(); + block_header.insert({std::move(column), name_type.type, name_type.name}); + } + + const char * begin = config.query.data(); + const char * end = begin + config.query.size(); + + ParserQuery parser(end, false); + auto query = parseQuery(parser, begin, end, "", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH, DBMS_DEFAULT_MAX_PARSER_BACKTRACKS); + + for (UInt64 i = 0; i < num_streams; ++i) + pipes.emplace_back(std::make_shared(max_block_size, block_header, config, query)); + + return Pipe::unitePipes(std::move(pipes)); +} + +StorageFuzzQuery::Configuration StorageFuzzQuery::getConfiguration(ASTs & engine_args, ContextPtr local_context) +{ + StorageFuzzQuery::Configuration configuration{}; + + // Supported signatures: + // + // FuzzQuery(query) + // FuzzQuery(query, max_query_length) + // FuzzQuery(query, max_query_length, random_seed) + if (engine_args.empty() || engine_args.size() > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "FuzzQuery requires 1 to 3 arguments: query, max_query_length, random_seed"); + + for (auto & engine_arg : engine_args) + engine_arg = evaluateConstantExpressionOrIdentifierAsLiteral(engine_arg, local_context); + + auto first_arg = checkAndGetLiteralArgument(engine_args[0], "query"); + configuration.query = std::move(first_arg); + + if (engine_args.size() >= 2) + { + const auto & literal = engine_args[1]->as(); + if (!literal.value.isNull()) + configuration.max_query_length = checkAndGetLiteralArgument(literal, "max_query_length"); + } + + if (engine_args.size() == 3) + { + const auto & literal = engine_args[2]->as(); + if (!literal.value.isNull()) + configuration.random_seed = checkAndGetLiteralArgument(literal, "random_seed"); + } + + return configuration; +} + +void registerStorageFuzzQuery(StorageFactory & factory) +{ + factory.registerStorage( + "FuzzQuery", + [](const StorageFactory::Arguments & args) -> std::shared_ptr + { + ASTs & engine_args = args.engine_args; + + if (engine_args.empty()) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Storage FuzzQuery must have arguments."); + + StorageFuzzQuery::Configuration configuration = 
StorageFuzzQuery::getConfiguration(engine_args, args.getLocalContext()); + + for (const auto& col : args.columns) + if (col.type->getTypeId() != TypeIndex::String) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "'StorageFuzzQuery' supports only columns of String type, got {}.", col.type->getName()); + + return std::make_shared(args.table_id, args.columns, args.comment, configuration); + }); +} + +} diff --git a/src/Storages/StorageFuzzQuery.h b/src/Storages/StorageFuzzQuery.h new file mode 100644 index 00000000000..125ef960e74 --- /dev/null +++ b/src/Storages/StorageFuzzQuery.h @@ -0,0 +1,88 @@ +#pragma once + +#include +#include +#include +#include + +#include "config.h" + +namespace DB +{ + +class NamedCollection; + +class StorageFuzzQuery final : public IStorage +{ +public: + struct Configuration : public StatelessTableEngineConfiguration + { + String query; + UInt64 max_query_length = 500; + UInt64 random_seed = randomSeed(); + }; + + StorageFuzzQuery( + const StorageID & table_id_, const ColumnsDescription & columns_, const String & comment_, const Configuration & config_); + + std::string getName() const override { return "FuzzQuery"; } + + Pipe read( + const Names & column_names, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & query_info, + ContextPtr context, + QueryProcessingStage::Enum processed_stage, + size_t max_block_size, + size_t num_streams) override; + + static StorageFuzzQuery::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); + +private: + const Configuration config; +}; + + +class FuzzQuerySource : public ISource +{ +public: + FuzzQuerySource( + UInt64 block_size_, Block block_header_, const StorageFuzzQuery::Configuration & config_, ASTPtr query_) + : ISource(block_header_) + , block_size(block_size_) + , block_header(std::move(block_header_)) + , config(config_) + , query(query_) + , fuzzer(config_.random_seed) + { + } + + String getName() const override { return "FuzzQuery"; } + +protected: + Chunk generate() override + { + Columns columns; + columns.reserve(block_header.columns()); + for (const auto & col : block_header) + { + chassert(col.type->getTypeId() == TypeIndex::String); + columns.emplace_back(createColumn()); + } + + return {std::move(columns), block_size}; + } + +private: + ColumnPtr createColumn(); + + UInt64 block_size; + Block block_header; + + StorageFuzzQuery::Configuration config; + ASTPtr query; + + QueryFuzzer fuzzer; +}; + +} diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index 8f33314397c..adc1074b1fe 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -26,6 +26,7 @@ void registerStorageGenerateRandom(StorageFactory & factory); void registerStorageExecutable(StorageFactory & factory); void registerStorageWindowView(StorageFactory & factory); void registerStorageLoop(StorageFactory & factory); +void registerStorageFuzzQuery(StorageFactory & factory); #if USE_RAPIDJSON || USE_SIMDJSON void registerStorageFuzzJSON(StorageFactory & factory); #endif @@ -126,6 +127,7 @@ void registerStorages() registerStorageExecutable(factory); registerStorageWindowView(factory); registerStorageLoop(factory); + registerStorageFuzzQuery(factory); #if USE_RAPIDJSON || USE_SIMDJSON registerStorageFuzzJSON(factory); #endif diff --git a/src/TableFunctions/TableFunctionFuzzQuery.cpp b/src/TableFunctions/TableFunctionFuzzQuery.cpp new file mode 100644 index 00000000000..224f6666556 --- /dev/null +++ b/src/TableFunctions/TableFunctionFuzzQuery.cpp @@ -0,0 
+1,54 @@ +#include + +#include +#include +#include +#include + +namespace DB +{ + + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +void TableFunctionFuzzQuery::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + ASTs & args_func = ast_function->children; + + if (args_func.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments", getName()); + + auto args = args_func.at(0)->children; + configuration = StorageFuzzQuery::getConfiguration(args, context); +} + +StoragePtr TableFunctionFuzzQuery::executeImpl( + const ASTPtr & /*ast_function*/, + ContextPtr context, + const std::string & table_name, + ColumnsDescription /*cached_columns*/, + bool is_insert_query) const +{ + ColumnsDescription columns = getActualTableStructure(context, is_insert_query); + auto res = std::make_shared( + StorageID(getDatabaseName(), table_name), + columns, + /* comment */ String{}, + configuration); + res->startup(); + return res; +} + +void registerTableFunctionFuzzQuery(TableFunctionFactory & factory) +{ + factory.registerFunction( + {.documentation + = {.description = "Perturbs a query string with random variations.", + .returned_value = "A table object with a single column containing perturbed query strings."}, + .allow_readonly = true}); +} + +} diff --git a/src/TableFunctions/TableFunctionFuzzQuery.h b/src/TableFunctions/TableFunctionFuzzQuery.h new file mode 100644 index 00000000000..22d10341c4d --- /dev/null +++ b/src/TableFunctions/TableFunctionFuzzQuery.h @@ -0,0 +1,42 @@ +#pragma once + +#include + +#include +#include +#include + +#include "config.h" + +namespace DB +{ + +class TableFunctionFuzzQuery : public ITableFunction +{ +public: + static constexpr auto name = "fuzzQuery"; + std::string getName() const override { return name; } + + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + + ColumnsDescription getActualTableStructure(ContextPtr /* context */, bool /* is_insert_query */) const override + { + return ColumnsDescription{{"query", std::make_shared()}}; + } + +private: + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; + + const char * getStorageTypeName() const override { return "fuzzQuery"; } + + String source; + std::optional random_seed; + StorageFuzzQuery::Configuration configuration; +}; + +} diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index ca4913898f9..a6c90872f12 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -26,6 +26,7 @@ void registerTableFunctions() registerTableFunctionMongoDB(factory); registerTableFunctionRedis(factory); registerTableFunctionMergeTreeIndex(factory); + registerTableFunctionFuzzQuery(factory); #if USE_RAPIDJSON || USE_SIMDJSON registerTableFunctionFuzzJSON(factory); #endif diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index efde4d6dcdc..2a8864a9bfd 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -23,6 +23,7 @@ void registerTableFunctionGenerate(TableFunctionFactory & factory); void registerTableFunctionMongoDB(TableFunctionFactory & factory); void registerTableFunctionRedis(TableFunctionFactory & factory); void 
registerTableFunctionMergeTreeIndex(TableFunctionFactory & factory); +void registerTableFunctionFuzzQuery(TableFunctionFactory & factory); #if USE_RAPIDJSON || USE_SIMDJSON void registerTableFunctionFuzzJSON(TableFunctionFactory & factory); #endif diff --git a/tests/queries/0_stateless/03031_table_function_fuzzquery.reference b/tests/queries/0_stateless/03031_table_function_fuzzquery.reference new file mode 100644 index 00000000000..202e4557a33 --- /dev/null +++ b/tests/queries/0_stateless/03031_table_function_fuzzquery.reference @@ -0,0 +1,2 @@ +query +String diff --git a/tests/queries/0_stateless/03031_table_function_fuzzquery.sql b/tests/queries/0_stateless/03031_table_function_fuzzquery.sql new file mode 100644 index 00000000000..b26096f7f0e --- /dev/null +++ b/tests/queries/0_stateless/03031_table_function_fuzzquery.sql @@ -0,0 +1,18 @@ + +SELECT * FROM fuzzQuery('SELECT 1', 500, 8956) LIMIT 0 FORMAT TSVWithNamesAndTypes; + +SELECT * FROM fuzzQuery('SELECT * +FROM ( + SELECT + ([toString(number % 2)] :: Array(LowCardinality(String))) AS item_id, + count() + FROM numbers(3) + GROUP BY item_id WITH TOTALS +) AS l FULL JOIN ( + SELECT + ([toString((number % 2) * 2)] :: Array(String)) AS item_id + FROM numbers(3) +) AS r +ON l.item_id = r.item_id +ORDER BY 1,2,3; +', 500, 8956) LIMIT 10 FORMAT NULL; From 08bde9cb44dce6fdf069527852faa8ec29a71b10 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 2 Aug 2024 18:28:33 -0400 Subject: [PATCH 08/56] fix conflict --- src/Client/ClientBase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 1a23b6b1363..45251aea28a 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -2,7 +2,7 @@ #include -#include +#include #include #include #include From d5629655c77653f51e44e1bfa9f8935b000da891 Mon Sep 17 00:00:00 2001 From: jsc0218 Date: Sun, 4 Aug 2024 21:52:29 +0000 Subject: [PATCH 09/56] fix --- .../MergeTree/MergeTreeIndexBloomFilter.cpp | 26 ++++++++++++++++++- ..._bloom_filter_not_supported_func.reference | 2 ++ .../03215_bloom_filter_not_supported_func.sql | 14 ++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference create mode 100644 tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp index c6a00751f25..7b873b0e3f2 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp @@ -366,7 +366,31 @@ bool MergeTreeIndexConditionBloomFilter::extractAtomFromTree(const RPNBuilderTre } } - return traverseFunction(node, out, nullptr /*parent*/); + if (node.isFunction()) + { + /// Similar to the logic of KeyCondition, restrict the usage of bloom filter, in case of func like cast(c=1 or c=9999 as Bool). 
+ const std::unordered_set atom_map + { + "equals", + "notEquals", + "has", + "mapContains", + "indexOf", + "hasAny", + "hasAll", + "in", + "notIn", + "globalIn", + "globalNotIn" + }; + + auto func_name = node.toFunctionNode().getFunctionName(); + if (atom_map.find(func_name) == std::end(atom_map)) + return false; + } + + bool res = traverseFunction(node, out, nullptr /*parent*/); + return res; } bool MergeTreeIndexConditionBloomFilter::traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent) diff --git a/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference b/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference new file mode 100644 index 00000000000..6ed281c757a --- /dev/null +++ b/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference @@ -0,0 +1,2 @@ +1 +1 diff --git a/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql b/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql new file mode 100644 index 00000000000..3d094244892 --- /dev/null +++ b/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql @@ -0,0 +1,14 @@ +drop table if exists t; + +create table t ( + c Int32, + index x1 (c) type bloom_filter +) engine=MergeTree order by c as select 1; + +SELECT count() FROM t WHERE cast(c=1 or c=9999 as Bool) +settings use_skip_indexes=0; + +SELECT count() FROM t WHERE cast(c=1 or c=9999 as Bool) +settings use_skip_indexes=1; + +drop table t; \ No newline at end of file From 11fd263be6e24ee4cdc3a51ac497510c82837fa5 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Thu, 1 Aug 2024 08:35:05 +0000 Subject: [PATCH 10/56] implement DROP DETACHED PARTITION ALL Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MergeTreeData.cpp | 7 +++++-- .../03203_drop_detached_partition_all.reference | 5 +++++ .../0_stateless/03203_drop_detached_partition_all.sql | 8 ++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/03203_drop_detached_partition_all.reference create mode 100644 tests/queries/0_stateless/03203_drop_detached_partition_all.sql diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 30a4a7caa0f..3d3ae2e63ea 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -6253,10 +6253,13 @@ void MergeTreeData::dropDetached(const ASTPtr & partition, bool part, ContextPtr } else { - String partition_id = getPartitionIDFromQuery(partition, local_context); + String partition_id; + bool all = partition->as()->all; + if (!all) + partition_id = getPartitionIDFromQuery(partition, local_context); DetachedPartsInfo detached_parts = getDetachedParts(); for (const auto & part_info : detached_parts) - if (part_info.valid_name && part_info.partition_id == partition_id + if (part_info.valid_name && (all || part_info.partition_id == partition_id) && part_info.prefix != "attaching" && part_info.prefix != "deleting") renamed_parts.addPart(part_info.dir_name, "deleting_" + part_info.dir_name, part_info.disk); } diff --git a/tests/queries/0_stateless/03203_drop_detached_partition_all.reference b/tests/queries/0_stateless/03203_drop_detached_partition_all.reference new file mode 100644 index 00000000000..c0f52d1d898 --- /dev/null +++ b/tests/queries/0_stateless/03203_drop_detached_partition_all.reference @@ -0,0 +1,5 @@ +1 1 +2 2 +3 3 +3 +0 diff --git a/tests/queries/0_stateless/03203_drop_detached_partition_all.sql 
b/tests/queries/0_stateless/03203_drop_detached_partition_all.sql new file mode 100644 index 00000000000..e29eb4ae36b --- /dev/null +++ b/tests/queries/0_stateless/03203_drop_detached_partition_all.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS t_03203; +CREATE TABLE t_03203 (p UInt64, v UInt64) ENGINE = MergeTree PARTITION BY p ORDER BY v; +INSERT INTO t_03203 VALUES (1, 1), (2, 2), (3, 3); +SELECT * FROM t_03203 ORDER BY p, v; +ALTER TABLE t_03203 DETACH PARTITION ALL; +SELECT count() FROM system.detached_parts WHERE database = currentDatabase() AND table = 't_03203'; +ALTER TABLE t_03203 DROP DETACHED PARTITION ALL SETTINGS allow_drop_detached = 1; +SELECT count() FROM system.detached_parts WHERE database = currentDatabase() AND table = 't_03203'; From c6c2fce9d22739b1881d5b4814dbbabe8ab8f09b Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 6 Aug 2024 07:51:35 +0000 Subject: [PATCH 11/56] update document Signed-off-by: Duc Canh Le --- docs/en/sql-reference/statements/alter/partition.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 778816f8934..1bb7817364a 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -9,6 +9,7 @@ The following operations with [partitions](/docs/en/engines/table-engines/merget - [DETACH PARTITION\|PART](#detach-partitionpart) — Moves a partition or part to the `detached` directory and forget it. - [DROP PARTITION\|PART](#drop-partitionpart) — Deletes a partition or part. +- [DROP DETACHED PARTITION\|PART](#drop-detached-partitionpart) - Delete a part or all parts of a partition from `detached`. - [FORGET PARTITION](#forget-partition) — Deletes a partition metadata from zookeeper if it's empty. - [ATTACH PARTITION\|PART](#attach-partitionpart) — Adds a partition or part from the `detached` directory to the table. - [ATTACH PARTITION FROM](#attach-partition-from) — Copies the data partition from one table to another and adds. @@ -68,7 +69,7 @@ ALTER TABLE mt DROP PART 'all_4_4_0'; ## DROP DETACHED PARTITION\|PART ``` sql -ALTER TABLE table_name [ON CLUSTER cluster] DROP DETACHED PARTITION|PART partition_expr +ALTER TABLE table_name [ON CLUSTER cluster] DROP DETACHED PARTITION|PART ALL|partition_expr ``` Removes the specified part or all parts of the specified partition from `detached`. 
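
The new `ALL` form documented here is exercised end-to-end by the test added in the previous patch; as a minimal usage sketch (the table name is illustrative, not taken from the patches), detaching every partition and then purging all detached parts looks like this:

``` sql
-- Detach all partitions, then drop every detached part in a single statement.
-- allow_drop_detached must be enabled for DROP DETACHED to be accepted.
ALTER TABLE t DETACH PARTITION ALL;
ALTER TABLE t DROP DETACHED PARTITION ALL SETTINGS allow_drop_detached = 1;
```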
From 2776a515ba36be6946919bc43ba267797d929cdd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Aug 2024 12:29:54 +0000 Subject: [PATCH 12/56] Cosmetics I --- src/Storages/Statistics/Statistics.cpp | 12 ++++++------ src/Storages/Statistics/StatisticsCountMinSketch.cpp | 4 ++-- src/Storages/Statistics/StatisticsCountMinSketch.h | 4 ++-- src/Storages/Statistics/StatisticsTDigest.cpp | 4 ++-- src/Storages/Statistics/StatisticsTDigest.h | 4 ++-- src/Storages/Statistics/StatisticsUniq.cpp | 4 ++-- src/Storages/Statistics/StatisticsUniq.h | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index ade3326288a..2a17101478a 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -204,15 +204,15 @@ void MergeTreeStatisticsFactory::registerValidator(StatisticsType stats_type, Va MergeTreeStatisticsFactory::MergeTreeStatisticsFactory() { - registerValidator(StatisticsType::TDigest, tdigestValidator); - registerCreator(StatisticsType::TDigest, tdigestCreator); + registerValidator(StatisticsType::TDigest, tdigestStatisticsValidator); + registerCreator(StatisticsType::TDigest, tdigestStatisticsCreator); - registerValidator(StatisticsType::Uniq, uniqValidator); - registerCreator(StatisticsType::Uniq, uniqCreator); + registerValidator(StatisticsType::Uniq, uniqStatisticsValidator); + registerCreator(StatisticsType::Uniq, uniqStatisticsCreator); #if USE_DATASKETCHES - registerValidator(StatisticsType::CountMinSketch, countMinSketchValidator); - registerCreator(StatisticsType::CountMinSketch, countMinSketchCreator); + registerValidator(StatisticsType::CountMinSketch, countMinSketchStatisticsValidator); + registerCreator(StatisticsType::CountMinSketch, countMinSketchStatisticsCreator); #endif } diff --git a/src/Storages/Statistics/StatisticsCountMinSketch.cpp b/src/Storages/Statistics/StatisticsCountMinSketch.cpp index e69bbc1515b..50d3b6e515c 100644 --- a/src/Storages/Statistics/StatisticsCountMinSketch.cpp +++ b/src/Storages/Statistics/StatisticsCountMinSketch.cpp @@ -84,7 +84,7 @@ void StatisticsCountMinSketch::deserialize(ReadBuffer & buf) } -void countMinSketchValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +void countMinSketchStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type) { data_type = removeNullable(data_type); data_type = removeLowCardinalityAndNullable(data_type); @@ -92,7 +92,7 @@ void countMinSketchValidator(const SingleStatisticsDescription &, DataTypePtr da throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'count_min' does not support type {}", data_type->getName()); } -StatisticsPtr countMinSketchCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) { return std::make_shared(stat, data_type); } diff --git a/src/Storages/Statistics/StatisticsCountMinSketch.h b/src/Storages/Statistics/StatisticsCountMinSketch.h index 6c8b74f8c35..d10bc78a88e 100644 --- a/src/Storages/Statistics/StatisticsCountMinSketch.h +++ b/src/Storages/Statistics/StatisticsCountMinSketch.h @@ -31,8 +31,8 @@ private: }; -void countMinSketchValidator(const SingleStatisticsDescription &, DataTypePtr data_type); -StatisticsPtr countMinSketchCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void countMinSketchStatisticsValidator(const SingleStatisticsDescription &, 
DataTypePtr data_type); +StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr); } diff --git a/src/Storages/Statistics/StatisticsTDigest.cpp b/src/Storages/Statistics/StatisticsTDigest.cpp index 66150e00fdb..7c5ea443201 100644 --- a/src/Storages/Statistics/StatisticsTDigest.cpp +++ b/src/Storages/Statistics/StatisticsTDigest.cpp @@ -57,7 +57,7 @@ Float64 StatisticsTDigest::estimateEqual(const Field & val) const throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName()); } -void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +void tdigestStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type) { data_type = removeNullable(data_type); data_type = removeLowCardinalityAndNullable(data_type); @@ -65,7 +65,7 @@ void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName()); } -StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr) +StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr) { return std::make_shared(stat); } diff --git a/src/Storages/Statistics/StatisticsTDigest.h b/src/Storages/Statistics/StatisticsTDigest.h index 614973e5d8b..d41b0648aa4 100644 --- a/src/Storages/Statistics/StatisticsTDigest.h +++ b/src/Storages/Statistics/StatisticsTDigest.h @@ -23,7 +23,7 @@ private: QuantileTDigest t_digest; }; -void tdigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type); -StatisticsPtr tdigestCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void tdigestStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type); +StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr); } diff --git a/src/Storages/Statistics/StatisticsUniq.cpp b/src/Storages/Statistics/StatisticsUniq.cpp index 8f60ffcf0b5..c259f09e0ae 100644 --- a/src/Storages/Statistics/StatisticsUniq.cpp +++ b/src/Storages/Statistics/StatisticsUniq.cpp @@ -52,7 +52,7 @@ UInt64 StatisticsUniq::estimateCardinality() const return column->getUInt(0); } -void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +void uniqStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type) { data_type = removeNullable(data_type); data_type = removeLowCardinalityAndNullable(data_type); @@ -60,7 +60,7 @@ void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName()); } -StatisticsPtr uniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) { return std::make_shared(stat, data_type); } diff --git a/src/Storages/Statistics/StatisticsUniq.h b/src/Storages/Statistics/StatisticsUniq.h index faabde8d47c..743714e1e1b 100644 --- a/src/Storages/Statistics/StatisticsUniq.h +++ b/src/Storages/Statistics/StatisticsUniq.h @@ -27,7 +27,7 @@ private: }; -void uniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type); -StatisticsPtr uniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); +void uniqStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr 
data_type); +StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); } From d09c82ff76186a93b0c521c67ab30d9101bc4769 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Aug 2024 12:33:18 +0000 Subject: [PATCH 13/56] Cosmetics II --- .../mergetree-family/mergetree.md | 2 +- src/Storages/Statistics/Statistics.cpp | 20 ++++++++++--------- .../Statistics/StatisticsCountMinSketch.cpp | 10 +++++----- .../Statistics/StatisticsCountMinSketch.h | 6 +++--- src/Storages/Statistics/StatisticsTDigest.cpp | 10 +++++----- src/Storages/Statistics/StatisticsTDigest.h | 6 +++--- src/Storages/Statistics/StatisticsUniq.cpp | 10 +++++----- src/Storages/Statistics/StatisticsUniq.h | 6 +++--- 8 files changed, 36 insertions(+), 34 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 7ffbd9a5bae..183b94f4641 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1005,7 +1005,7 @@ They can be used for prewhere optimization only if we enable `set allow_statisti ## Column-level Settings {#column-level-settings} -Certain MergeTree settings can be override at column level: +Certain MergeTree settings can be overridden at column level: - `max_compress_block_size` — Maximum size of blocks of uncompressed data before compressing for writing to a table. - `min_compress_block_size` — Minimum size of blocks of uncompressed data required for compression when writing the next mark. diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 2a17101478a..771304405a6 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -89,15 +89,17 @@ Float64 IStatistics::estimateLess(const Field & /*val*/) const throw Exception(ErrorCodes::LOGICAL_ERROR, "Less-than estimation is not implemented for this type of statistics"); } -/// ------------------------------------- -/// Implementation of the estimation: -/// Note: Each statistics object supports certain types predicates natively, e.g. -/// - TDigest: '< X' (less-than predicates) -/// - Count-min sketches: '= X' (equal predicates) -/// - Uniq (HyperLogLog): 'count distinct(*)' (column cardinality) -/// If multiple statistics objects are available per column, it is sometimes also possible to combine them in a clever way. -/// For that reason, all estimation are performed in a central place (here), and we don't simply pass the predicate to the first statistics -/// object that supports it natively. +/// Notes: +/// - Statistics object usually only support estimation for certain types of predicates, e.g. +/// - TDigest: '< X' (less-than predicates) +/// - Count-min sketches: '= X' (equal predicates) +/// - Uniq (HyperLogLog): 'count distinct(*)' (column cardinality) +/// +/// If multiple statistics objects in a column support estimating a predicate, we want to try statistics in order of descending accuracy +/// (e.g. MinMax statistics are simpler than TDigest statistics and thus worse for estimating 'less' predicates). +/// +/// Sometimes, it is possible to combine multiple statistics in a clever way. For that reason, all estimation are performed in a central +/// place (here), and we don't simply pass the predicate to the first statistics object that supports it natively. 
Float64 ColumnStatistics::estimateLess(const Field & val) const { diff --git a/src/Storages/Statistics/StatisticsCountMinSketch.cpp b/src/Storages/Statistics/StatisticsCountMinSketch.cpp index 50d3b6e515c..dce5b39ae56 100644 --- a/src/Storages/Statistics/StatisticsCountMinSketch.cpp +++ b/src/Storages/Statistics/StatisticsCountMinSketch.cpp @@ -25,8 +25,8 @@ extern const int ILLEGAL_STATISTICS; static constexpr auto num_hashes = 7uz; static constexpr auto num_buckets = 2718uz; -StatisticsCountMinSketch::StatisticsCountMinSketch(const SingleStatisticsDescription & stat_, DataTypePtr data_type_) - : IStatistics(stat_) +StatisticsCountMinSketch::StatisticsCountMinSketch(const SingleStatisticsDescription & description, DataTypePtr data_type_) + : IStatistics(description) , sketch(num_hashes, num_buckets) , data_type(data_type_) { @@ -84,7 +84,7 @@ void StatisticsCountMinSketch::deserialize(ReadBuffer & buf) } -void countMinSketchStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +void countMinSketchStatisticsValidator(const SingleStatisticsDescription & /*description*/, DataTypePtr data_type) { data_type = removeNullable(data_type); data_type = removeLowCardinalityAndNullable(data_type); @@ -92,9 +92,9 @@ void countMinSketchStatisticsValidator(const SingleStatisticsDescription &, Data throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'count_min' does not support type {}", data_type->getName()); } -StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type) { - return std::make_shared(stat, data_type); + return std::make_shared(description, data_type); } } diff --git a/src/Storages/Statistics/StatisticsCountMinSketch.h b/src/Storages/Statistics/StatisticsCountMinSketch.h index d10bc78a88e..af01408f2a3 100644 --- a/src/Storages/Statistics/StatisticsCountMinSketch.h +++ b/src/Storages/Statistics/StatisticsCountMinSketch.h @@ -14,7 +14,7 @@ namespace DB class StatisticsCountMinSketch : public IStatistics { public: - StatisticsCountMinSketch(const SingleStatisticsDescription & stat_, DataTypePtr data_type_); + StatisticsCountMinSketch(const SingleStatisticsDescription & description, DataTypePtr data_type_); Float64 estimateEqual(const Field & val) const override; @@ -31,8 +31,8 @@ private: }; -void countMinSketchStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type); -StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void countMinSketchStatisticsValidator(const SingleStatisticsDescription & description, DataTypePtr data_type); +StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type); } diff --git a/src/Storages/Statistics/StatisticsTDigest.cpp b/src/Storages/Statistics/StatisticsTDigest.cpp index 7c5ea443201..73ab6c84b4e 100644 --- a/src/Storages/Statistics/StatisticsTDigest.cpp +++ b/src/Storages/Statistics/StatisticsTDigest.cpp @@ -10,8 +10,8 @@ extern const int ILLEGAL_STATISTICS; extern const int LOGICAL_ERROR; } -StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & stat_) - : IStatistics(stat_) +StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & description) + : IStatistics(description) { } @@ -57,7 +57,7 @@ Float64 StatisticsTDigest::estimateEqual(const Field & val) const throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName()); } -void tdigestStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +void tdigestStatisticsValidator(const SingleStatisticsDescription & /*description*/, DataTypePtr data_type) { data_type = removeNullable(data_type); data_type = removeLowCardinalityAndNullable(data_type); @@ -65,9 +65,9 @@ void tdigestStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName()); } -StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr) +StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr /*data_type*/) { - return std::make_shared(stat); + return std::make_shared(description); } } diff --git a/src/Storages/Statistics/StatisticsTDigest.h b/src/Storages/Statistics/StatisticsTDigest.h index d41b0648aa4..47d6c93f64c 100644 --- a/src/Storages/Statistics/StatisticsTDigest.h +++ b/src/Storages/Statistics/StatisticsTDigest.h @@ -9,7 +9,7 @@ namespace DB class StatisticsTDigest : public IStatistics { public: - explicit StatisticsTDigest(const SingleStatisticsDescription & stat_); + explicit StatisticsTDigest(const SingleStatisticsDescription & description); void update(const ColumnPtr & column) override; @@ -23,7 +23,7 @@ private: QuantileTDigest t_digest; }; -void tdigestStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type); -StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void tdigestStatisticsValidator(const SingleStatisticsDescription & description, DataTypePtr data_type); +StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type); } diff --git a/src/Storages/Statistics/StatisticsUniq.cpp b/src/Storages/Statistics/StatisticsUniq.cpp index c259f09e0ae..e737f9987a5 100644 --- a/src/Storages/Statistics/StatisticsUniq.cpp +++ b/src/Storages/Statistics/StatisticsUniq.cpp @@ -11,8 +11,8 @@ namespace ErrorCodes extern const int ILLEGAL_STATISTICS; } -StatisticsUniq::StatisticsUniq(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type) - : IStatistics(stat_) +StatisticsUniq::StatisticsUniq(const SingleStatisticsDescription & description, const DataTypePtr & data_type) + : IStatistics(description) { arena = std::make_unique(); AggregateFunctionProperties properties; @@ -52,7 +52,7 @@ UInt64 StatisticsUniq::estimateCardinality() const return column->getUInt(0); } -void uniqStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +void uniqStatisticsValidator(const SingleStatisticsDescription & /*description*/, DataTypePtr data_type) { data_type = removeNullable(data_type); data_type = removeLowCardinalityAndNullable(data_type); @@ -60,9 +60,9 @@ void uniqStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr da throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName()); } -StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type) { - return std::make_shared(stat, data_type); + return std::make_shared(description, data_type); } } diff --git 
a/src/Storages/Statistics/StatisticsUniq.h b/src/Storages/Statistics/StatisticsUniq.h index 743714e1e1b..6b511d4f496 100644 --- a/src/Storages/Statistics/StatisticsUniq.h +++ b/src/Storages/Statistics/StatisticsUniq.h @@ -10,7 +10,7 @@ namespace DB class StatisticsUniq : public IStatistics { public: - StatisticsUniq(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type); + StatisticsUniq(const SingleStatisticsDescription & description, const DataTypePtr & data_type); ~StatisticsUniq() override; void update(const ColumnPtr & column) override; @@ -27,7 +27,7 @@ private: }; -void uniqStatisticsValidator(const SingleStatisticsDescription &, DataTypePtr data_type); -StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); +void uniqStatisticsValidator(const SingleStatisticsDescription & description, DataTypePtr data_type); +StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type); } From f3ee25036f9c5796a9018699d575f94bf75a50b5 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 2 May 2024 17:39:54 +0000 Subject: [PATCH 14/56] Building aarch64 builds with '-no-pie' to allow better introspection --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f796e6c4616..0d862b23e3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -428,12 +428,14 @@ if (NOT SANITIZE) set (CMAKE_POSITION_INDEPENDENT_CODE OFF) endif() -if (OS_LINUX AND NOT (ARCH_AARCH64 OR ARCH_S390X) AND NOT SANITIZE) +if (NOT OS_ANDROID AND OS_LINUX AND NOT ARCH_S390X AND NOT SANITIZE) # Slightly more efficient code can be generated - # It's disabled for ARM because otherwise ClickHouse cannot run on Android. + # Disabled for Android, because otherwise ClickHouse cannot run on Android. set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -no-pie -Wl,-no-pie") +else () + message (WARNING "ClickHouse is built as PIE, system.trace_log will contain invalid addresses after server restart.") endif () if (ENABLE_TESTS) From ea1575f60aa41d62c3d22211d8dfb5e187b2194e Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 3 Aug 2024 21:14:04 +0200 Subject: [PATCH 15/56] tests: avoid leaving processes leftovers Previously processes cleanup on i.e. SIGINT simply did not work, because the launcher kills only processes in process group, while tests are launched with start_new_session=True for Popen(), which creates own process group. This is needed for killing process group in case of test timeout. So instead, look at the parent pid, and kill the child process groups. Also add some logging to make it more explicit which processes will be killed. 
Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 99 +++++++++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 28 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 907d773337a..5e70b37e232 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -358,14 +358,78 @@ def clickhouse_execute_json( return rows +def kill_process_group(pgid): + print(f"Killing process group {pgid}") + print(f"Processes in process group {pgid}:") + print( + subprocess.check_output( + f"pgrep --pgroup {pgid} -a", shell=True, stderr=subprocess.STDOUT + ).decode("utf-8"), + end="", + ) + try: + # NOTE: this still may leave some processes, that had been + # created by timeout(1), since it also creates new process + # group. But this should not be a problem with default + # options, since the default time for each test is 10min, + # and this is way more bigger then the timeout for each + # timeout(1) invocation. + # + # But as a workaround we are sending SIGTERM first, and + # only after SIGKILL, that way timeout(1) will have an + # ability to terminate childrens (though not always since + # signals are asynchronous). + os.killpg(pgid, signal.SIGTERM) + # This may not be enough, but this is at least something + # (and anyway it is OK to spend 0.1 second more in case of + # test timeout). + sleep(0.1) + os.killpg(pgid, signal.SIGKILL) + except OSError as e: + if e.errno == ESRCH: + print(f"Got ESRCH while killing {pgid}. Ignoring.") + else: + raise + print(f"Process group {pgid} should be killed") + + +def cleanup_child_processes(pid): + pgid = os.getpgid(os.getpid()) + print(f"Child processes of {pid}:") + print( + subprocess.check_output( + f"pgrep --parent {pid} -a", shell=True, stderr=subprocess.STDOUT + ).decode("utf-8"), + end="", + ) + # Due to start_new_session=True, it is not enough to kill by PGID, we need + # to look at children processes as well. + # But we are hoping that nobody create session in the tests (though it is + # possible via timeout(), but we assuming that they will be killed by + # timeout). + processes = subprocess.check_output( + f"pgrep --parent {pid}", shell=True, stderr=subprocess.STDOUT + ) + processes = processes.decode("utf-8") + processes = processes.strip() + processes = processes.split("\n") + processes = map(lambda x: int(x.strip()), processes) + processes = list(processes) + for child in processes: + child_pgid = os.getpgid(child) + if child_pgid != pgid: + kill_process_group(child_pgid) + + # SIGKILL should not be sent, since this will kill the script itself + os.killpg(pgid, signal.SIGTERM) + + +# send signal to all processes in group to avoid hung check triggering +# (to avoid terminating clickhouse-test itself, the signal should be ignored) def stop_tests(): - # send signal to all processes in group to avoid hung check triggering - # (to avoid terminating clickhouse-test itself, the signal should be ignored) - print("Sending signals") signal.signal(signal.SIGTERM, signal.SIG_IGN) - os.killpg(os.getpgid(os.getpid()), signal.SIGTERM) - signal.signal(signal.SIGTERM, signal.SIG_DFL) - print("Sending signals DONE") + cleanup_child_processes(os.getpid()) + signal.signal(signal.SIGTERM, signal_handler) def get_db_engine(args, database_name): @@ -1258,28 +1322,7 @@ class TestCase: if proc: if proc.returncode is None: - try: - pgid = os.getpgid(proc.pid) - # NOTE: this still may leave some processes, that had been - # created by timeout(1), since it also creates new process - # group. 
But this should not be a problem with default - # options, since the default time for each test is 10min, - # and this is way more bigger then the timeout for each - # timeout(1) invocation. - # - # But as a workaround we are sending SIGTERM first, and - # only after SIGKILL, that way timeout(1) will have an - # ability to terminate childrens (though not always since - # signals are asynchronous). - os.killpg(pgid, signal.SIGTERM) - # This may not be enough, but this is at least something - # (and anyway it is OK to spend 0.1 second more in case of - # test timeout). - sleep(0.1) - os.killpg(pgid, signal.SIGKILL) - except OSError as e: - if e.errno != ESRCH: - raise + kill_process_group(os.getpgid(proc.pid)) if stderr: description += stderr From f9dcce6da3b9468abf5cdc27915c4c093cd231d7 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 3 Aug 2024 21:23:37 +0200 Subject: [PATCH 16/56] tests: omit python stacktrace in case of signals/server died It is simply useless and only creates output that distracts. Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 5e70b37e232..10a537f665d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -3438,11 +3438,12 @@ def parse_args(): class Terminated(KeyboardInterrupt): - pass + def __init__(self, signal): + self.signal = signal -def signal_handler(sig, frame): - raise Terminated(f"Terminated with {sig} signal") +def signal_handler(signal, frame): + raise Terminated(signal) if __name__ == "__main__": @@ -3594,4 +3595,14 @@ if __name__ == "__main__": if args.replace_replicated_with_shared: args.s3_storage = True - main(args) + try: + main(args) + except ServerDied as e: + print(f"{e}", file=sys.stderr) + sys.exit(1) + except Terminated as e: + print(f"Terminated with {e.signal} signal", file=sys.stderr) + sys.exit(128 + e.signal) + except KeyboardInterrupt: + print("Interrupted") + sys.exit(128 + signal.SIGINT) From a478ad24a96b28a5cab77c01e77d5d510cddfabb Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 3 Aug 2024 21:26:15 +0200 Subject: [PATCH 17/56] tests: try to catch stacktraces from client in case of test timeouts This is to catch issues like [1]. [1]: https://github.com/ClickHouse/ClickHouse/issues/67736 Signed-off-by: Azat Khuzhin --- src/Common/SignalHandlers.cpp | 1 + tests/clickhouse-test | 20 +++++++++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/Common/SignalHandlers.cpp b/src/Common/SignalHandlers.cpp index e025e49e0a3..c4358da2453 100644 --- a/src/Common/SignalHandlers.cpp +++ b/src/Common/SignalHandlers.cpp @@ -629,6 +629,7 @@ void HandledSignals::setupTerminateHandler() void HandledSignals::setupCommonDeadlySignalHandlers() { /// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime.
+ /// NOTE: that it is also used by clickhouse-test wrapper addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, true); #if defined(SANITIZER) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 10a537f665d..20e0ce0b150 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -368,6 +368,13 @@ def kill_process_group(pgid): end="", ) try: + # Let's try to dump stacktrace in client (useful to catch issues there) + os.killpg(pgid, signal.SIGTSTP) + # Wait some time for clickhouse utilities to gather stacktrace + if RELEASE_BUILD: + sleep(0.5) + else: + sleep(5) # NOTE: this still may leave some processes, that had been # created by timeout(1), since it also creates new process # group. But this should not be a problem with default @@ -380,9 +387,8 @@ def kill_process_group(pgid): # ability to terminate childrens (though not always since # signals are asynchronous). os.killpg(pgid, signal.SIGTERM) - # This may not be enough, but this is at least something - # (and anyway it is OK to spend 0.1 second more in case of - # test timeout). + # We need minimal delay to let processes handle SIGTERM - 0.1 (this may + # not be enough, but at least something) sleep(0.1) os.killpg(pgid, signal.SIGKILL) except OSError as e: @@ -2396,7 +2402,13 @@ class BuildFlags: POLYMORPHIC_PARTS = "polymorphic-parts" +# Release and non-sanitizer build +RELEASE_BUILD = False + + def collect_build_flags(args): + global RELEASE_BUILD + result = [] value = clickhouse_execute( @@ -2421,6 +2433,8 @@ def collect_build_flags(args): elif b"RelWithDebInfo" in value or b"Release" in value: result.append(BuildFlags.RELEASE) + RELEASE_BUILD = result == [BuildFlags.RELEASE] + value = clickhouse_execute( args, "SELECT value FROM system.settings WHERE name = 'allow_deprecated_database_ordinary'", From a6ccf1986936b3cd65dd016d7259e25eb35f35d9 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 3 Aug 2024 21:55:03 +0200 Subject: [PATCH 18/56] tests: capture stderr/stdout/debuglog after terminating test It was simply wrong before, but now, with capturing stacktrace that can take some time, it is a must.
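The intended ordering, as a simplified and hypothetical sketch (not the actual process_result_impl() code; it assumes proc was started via subprocess.Popen with start_new_session=True): terminate the test process first, and only then read its stdout/stderr files, so that anything written while stacktraces were being collected is not lost.

    import os
    import signal

    def collect_outputs(proc, stdout_path, stderr_path):
        # Make sure the child (and its process group) is terminated first ...
        if proc.returncode is None:
            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
            proc.wait()

        # ... and only afterwards read the output files, so late writes are included.
        def read_file(path):
            if not os.path.exists(path):
                return ""
            with open(path, "rb") as f:
                return str(f.read(), errors="replace", encoding="utf-8")

        return read_file(stdout_path), read_file(stderr_path)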
Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 67 +++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 20e0ce0b150..a8c8b3614c8 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1318,18 +1318,35 @@ class TestCase: return None - def process_result_impl( - self, proc, stdout: str, stderr: str, debug_log: str, total_time: float - ): + def process_result_impl(self, proc, total_time: float): + if proc: + if proc.returncode is None: + kill_process_group(os.getpgid(proc.pid)) + description = "" + debug_log = "" + if os.path.exists(self.testcase_args.debug_log_file): + with open(self.testcase_args.debug_log_file, "rb") as stream: + debug_log += self.testcase_args.debug_log_file + ":\n" + debug_log += str(stream.read(), errors="replace", encoding="utf-8") + debug_log += "\n" + + stdout = "" + if os.path.exists(self.stdout_file): + with open(self.stdout_file, "rb") as stdfd: + stdout = str(stdfd.read(), errors="replace", encoding="utf-8") + + stderr = "" + if os.path.exists(self.stderr_file): + with open(self.stderr_file, "rb") as stdfd: + stderr += str(stdfd.read(), errors="replace", encoding="utf-8") + if debug_log: debug_log = "\n".join(debug_log.splitlines()[:100]) if proc: if proc.returncode is None: - kill_process_group(os.getpgid(proc.pid)) - if stderr: description += stderr if debug_log: @@ -1658,13 +1675,6 @@ class TestCase: # Whether the test timed out will be decided later pass - debug_log = "" - if os.path.exists(self.testcase_args.debug_log_file): - with open(self.testcase_args.debug_log_file, "rb") as stream: - debug_log += self.testcase_args.debug_log_file + ":\n" - debug_log += str(stream.read(), errors="replace", encoding="utf-8") - debug_log += "\n" - total_time = (datetime.now() - start_time).total_seconds() # Normalize randomized database names in stdout, stderr files. 
@@ -1716,17 +1726,7 @@ class TestCase: "https://localhost:8443/", ) - stdout = "" - if os.path.exists(self.stdout_file): - with open(self.stdout_file, "rb") as stdfd: - stdout = str(stdfd.read(), errors="replace", encoding="utf-8") - - stderr = "" - if os.path.exists(self.stderr_file): - with open(self.stderr_file, "rb") as stdfd: - stderr += str(stdfd.read(), errors="replace", encoding="utf-8") - - return proc, stdout, stderr, debug_log, total_time + return proc, total_time def run(self, args, suite, client_options, server_logs_level): start_time = datetime.now() @@ -1758,14 +1758,14 @@ class TestCase: if not is_valid_utf_8(self.case_file) or ( self.reference_file and not is_valid_utf_8(self.reference_file) ): - proc, stdout, stderr, debug_log, total_time = self.run_single_test( + proc, total_time = self.run_single_test( server_logs_level, client_options ) - result = self.process_result_impl( - proc, stdout, stderr, debug_log, total_time + result = self.process_result_impl(proc, total_time) + result.check_if_need_retry( + args, result.description, result.description, self.runs_count ) - result.check_if_need_retry(args, stdout, stderr, self.runs_count) # to avoid breaking CSV parser result.description = result.description.replace("\0", "") else: @@ -1783,17 +1783,16 @@ class TestCase: ): ( proc, - stdout, - stderr, - debug_log, total_time, ) = self.run_single_test(server_logs_level, client_options) - result = self.process_result_impl( - proc, stdout, stderr, debug_log, total_time - ) + result = self.process_result_impl(proc, total_time) + result.check_if_need_retry( - args, stdout, stderr, self.runs_count + args, + result.description, + result.description, + self.runs_count, ) # to avoid breaking CSV parser result.description = result.description.replace("\0", "") From b76fb165d11a7d39b36b8f4e13355c2488ab9e58 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 3 Aug 2024 22:13:29 +0200 Subject: [PATCH 19/56] tests: fix pylint issue in clickhouse_execute_http() Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index a8c8b3614c8..239375d7fec 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -267,7 +267,7 @@ def clickhouse_execute_http( max_http_retries=5, retry_error_codes=False, ): - if args.secure: + if base_args.secure: client = http.client.HTTPSConnection( host=base_args.tcp_host, port=base_args.http_port, timeout=timeout ) From ef7d12db6625b0a4e2cdf7ae5b6eb192e5d773af Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 5 Aug 2024 19:51:30 +0200 Subject: [PATCH 20/56] tests: change the process group earlier to avoid killing self Previously it was possible to have original pgid from the spawned threads, that could lead to killing the caller script and in case of CI it could be init process [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/67737/e68c9c8d16f37f6c25739076c9b071ed97952269/stress_test__asan_/stress_test_run_21.txt Repro: $ echo "SELECT '1" > tests/queries/0_stateless/00001_select_1.sql # break the test $ cat /tmp/test.sh ./tests/clickhouse-test 0001_select --test-runs 3 --max-failures-chain 1 --no-random-settings --no-random-merge-tree-settings Before this change: $ /tmp/test.sh Using queries from '/src/ch/worktrees/clickhouse-upstream/tests/queries' directory Connecting to ClickHouse server... 
OK Connected to server 24.8.1.1 @ bef896ce143ea4e0464c9829de6277ba06cc1a53 mt/rename-without-lock-v2 Running 3 stateless tests (MainProcess). 00001_select_1: [ FAIL ] Reason: return code: 62 Code: 62. DB::Exception: Syntax error: failed at position 8 (''1; '): '1; . Single quoted string is not closed: ''1; '. (SYNTAX_ERROR) , result: stdout: Database: test_hz2zwymr Child processes of 13041: 13042 python3 ./tests/clickhouse-test 0001_select --test-runs 3 --max-failures-chain 1 --no-random-settings --no-random-merge-tree-settings Killing process group 13040 Processes in process group 13040: 13040 -bash 13042 python3 ./tests/clickhouse-test 0001_select --test-runs 3 --max-failures-chain 1 --no-random-settings --no-random-merge-tree-settings [2]+ Stopped /tmp/test.sh [1]$ Process group 13040 should be killed Max failures chain [2]+ Killed /tmp/test.sh After: $ /tmp/test.sh Using queries from '/src/ch/worktrees/clickhouse-upstream/tests/queries' directory Connecting to ClickHouse server... OK Connected to server 24.8.1.1 @ bef896ce143ea4e0464c9829de6277ba06cc1a53 mt/rename-without-lock-v2 Running 3 stateless tests (MainProcess). 00001_select_1: [ FAIL ] Reason: return code: 62 Code: 62. DB::Exception: Syntax error: failed at position 8 (''1; '): '1; . Single quoted string is not closed: ''1; '. (SYNTAX_ERROR) , result: stdout: Database: test_urz6rk5z Child processes of 9782: 9785 python3 ./tests/clickhouse-test 0001_select --test-runs 3 --max-failures-chain 1 --no-random-settings --no-random-merge-tree-settings Max failures chain Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 239375d7fec..dfcef86cf7e 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -3460,16 +3460,17 @@ if __name__ == "__main__": + # Move to a new process group and kill it at exit so that we don't have any + # infinite tests processes left + # (new process group is required to avoid killing some parent processes) + os.setpgid(0, 0) + stop_time = None exit_code = multiprocessing.Value("i", 0) server_died = multiprocessing.Event() multiprocessing_manager = multiprocessing.Manager() restarted_tests = multiprocessing_manager.list() - # Move to a new process group and kill it at exit so that we don't have any - # infinite tests processes left - # (new process group is required to avoid killing some parent processes) - os.setpgid(0, 0) signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGHUP, signal_handler) From 8ce23ff1136c2de03348af92a595620eee703d9a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 6 Aug 2024 16:28:17 +0200 Subject: [PATCH 21/56] tests: increase delay to capture client stacktraces for sanitizers build 5 seconds is too small and not enough to print even a few frames.
[1]: https://s3.amazonaws.com/clickhouse-test-reports/67737/9658be5eea8351655dd3ea77b8c1d4717bac7999/stress_test__ubsan_.html Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index dfcef86cf7e..46d0f9e007e 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -374,7 +374,7 @@ def kill_process_group(pgid): if RELEASE_BUILD: sleep(0.5) else: - sleep(5) + sleep(10) # NOTE: this still may leave some processes, that had been # created by timeout(1), since it also creates new process # group. But this should not be a problem with default From 7c366a040fad26f7380adffb38c990f05f629c6d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 6 Aug 2024 16:33:44 +0200 Subject: [PATCH 22/56] ci: use bash arrays to pass opts to clickhouse-test for stateless/stateful Signed-off-by: Azat Khuzhin --- docker/test/stateful/run.sh | 24 +++++++++++++++++------- docker/test/stateless/run.sh | 20 +++++++++++++++----- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index 8e2f1890f89..1ad1f73395e 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -232,15 +232,25 @@ function run_tests() set +e + TEST_ARGS=( + -j 2 + --testname + --shard + --zookeeper + --check-zookeeper-session + --no-stateless + --hung-check + --print-time + "${ADDITIONAL_OPTIONS[@]}" + "$SKIP_TESTS_OPTION" + ) if [[ -n "$USE_PARALLEL_REPLICAS" ]] && [[ "$USE_PARALLEL_REPLICAS" -eq 1 ]]; then - clickhouse-test --client="clickhouse-client --allow_experimental_parallel_reading_from_replicas=1 --parallel_replicas_for_non_replicated_merge_tree=1 \ - --max_parallel_replicas=100 --cluster_for_parallel_replicas='parallel_replicas'" \ - -j 2 --testname --shard --zookeeper --check-zookeeper-session --no-stateless --no-parallel-replicas --hung-check --print-time "${ADDITIONAL_OPTIONS[@]}" \ - "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt - else - clickhouse-test -j 2 --testname --shard --zookeeper --check-zookeeper-session --no-stateless --hung-check --print-time "${ADDITIONAL_OPTIONS[@]}" \ - "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt + TEST_ARGS+=( + --client="clickhouse-client --allow_experimental_parallel_reading_from_replicas=1 --parallel_replicas_for_non_replicated_merge_tree=1 --max_parallel_replicas=100 --cluster_for_parallel_replicas='parallel_replicas'" + --no-parallel-replicas + ) fi + clickhouse-test "${TEST_ARGS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt set -e } diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index ea32df23af0..bcfc2020696 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -264,11 +264,21 @@ function run_tests() TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 
8400 : MAX_RUN_TIME - 800)) START_TIME=${SECONDS} set +e - timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s \ - clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \ - --no-drop-if-fail --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ - | ts '%Y-%m-%d %H:%M:%S' \ - | tee -a test_output/test_result.txt + + TEST_ARGS=( + --testname + --shard + --zookeeper + --check-zookeeper-session + --hung-check + --print-time + --no-drop-if-fail + --test-runs "$NUM_TRIES" + "${ADDITIONAL_OPTIONS[@]}" + ) + timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \ + | ts '%Y-%m-%d %H:%M:%S' \ + | tee -a test_output/test_result.txt set -e DURATION=$((SECONDS - START_TIME)) From 72bd43a309f8e327b7e252a9866dabd2496c26af Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 6 Aug 2024 16:36:05 +0200 Subject: [PATCH 23/56] tests: do not capture client stacktraces in stress tests They are too uncontrollable, and likely will leave some clients [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/67737/9658be5eea8351655dd3ea77b8c1d4717bac7999/stress_test__ubsan_.html Signed-off-by: Azat Khuzhin --- docker/test/stateful/run.sh | 1 + docker/test/stateless/run.sh | 1 + tests/clickhouse-test | 28 +++++++++++++++++++++------- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index 1ad1f73395e..3a4f0d97993 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -241,6 +241,7 @@ function run_tests() --no-stateless --hung-check --print-time + --capture-client-stacktrace "${ADDITIONAL_OPTIONS[@]}" "$SKIP_TESTS_OPTION" ) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index bcfc2020696..063195181a8 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -273,6 +273,7 @@ function run_tests() --hung-check --print-time --no-drop-if-fail + --capture-client-stacktrace --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" ) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 46d0f9e007e..88ff6753a8f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -358,7 +358,13 @@ def clickhouse_execute_json( return rows +# Should we capture client's stacktraces via SIGTSTP +CAPTURE_CLIENT_STACKTRACE = False + + def kill_process_group(pgid): + global CAPTURE_CLIENT_STACKTRACE + print(f"Killing process group {pgid}") print(f"Processes in process group {pgid}:") print( @@ -368,13 +374,14 @@ def kill_process_group(pgid): end="", ) try: - # Let's try to dump stacktrace in client (useful to catch issues there) - os.killpg(pgid, signal.SIGTSTP) - # Wait some time for clickhouse utilities to gather stacktrace - if RELEASE_BUILD: - sleep(0.5) - else: - sleep(10) + if CAPTURE_CLIENT_STACKTRACE: + # Let's try to dump stacktrace in client (useful to catch issues there) + os.killpg(pgid, signal.SIGTSTP) + # Wait some time for clickhouse utilities to gather stacktrace + if RELEASE_BUILD: + sleep(0.5) + else: + sleep(10) # NOTE: this still may leave some processes, that had been # created by timeout(1), since it also creates new process # group. 
But this should not be a problem with default @@ -3446,6 +3453,11 @@ def parse_args(): default="./client.fatal.log", help="Path to file for fatal logs from client", ) + parser.add_argument( + "--capture-client-stacktrace", + action="store_true", + help="Capture stacktraces from clickhouse-client/local on errors", + ) return parser.parse_args() @@ -3488,6 +3500,8 @@ if __name__ == "__main__": ) sys.exit(1) + CAPTURE_CLIENT_STACKTRACE = args.capture_client_stacktrace + # Autodetect the directory with queries if not specified if args.queries is None: args.queries = "queries" From f20cfdb54ea9ee577f9747e1e2d99af2c0e9e250 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Aug 2024 14:49:20 +0000 Subject: [PATCH 24/56] Cosmetics III --- src/Storages/Statistics/Statistics.cpp | 2 +- src/Storages/Statistics/Statistics.h | 6 +++--- src/Storages/Statistics/StatisticsCountMinSketch.cpp | 12 ++++++------ src/Storages/Statistics/StatisticsCountMinSketch.h | 6 +++--- src/Storages/Statistics/StatisticsTDigest.cpp | 10 +++++----- src/Storages/Statistics/StatisticsTDigest.h | 4 ++-- src/Storages/Statistics/StatisticsUniq.cpp | 10 +++++----- src/Storages/Statistics/StatisticsUniq.h | 4 ++-- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 771304405a6..e3f9fcc8192 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -224,7 +224,7 @@ MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() return instance; } -void MergeTreeStatisticsFactory::validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const +void MergeTreeStatisticsFactory::validate(const ColumnStatisticsDescription & stats, const DataTypePtr & data_type) const { for (const auto & [type, desc] : stats.types_to_desc) { diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index 16f0c67eabd..c6a45e68aa6 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -87,10 +87,10 @@ class MergeTreeStatisticsFactory : private boost::noncopyable public: static MergeTreeStatisticsFactory & instance(); - void validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const; + void validate(const ColumnStatisticsDescription & stats, const DataTypePtr & data_type) const; - using Validator = std::function; - using Creator = std::function; + using Validator = std::function; + using Creator = std::function; ColumnStatisticsPtr get(const ColumnStatisticsDescription & stats) const; ColumnsStatistics getMany(const ColumnsDescription & columns) const; diff --git a/src/Storages/Statistics/StatisticsCountMinSketch.cpp b/src/Storages/Statistics/StatisticsCountMinSketch.cpp index dce5b39ae56..0dc01f5fcf0 100644 --- a/src/Storages/Statistics/StatisticsCountMinSketch.cpp +++ b/src/Storages/Statistics/StatisticsCountMinSketch.cpp @@ -25,7 +25,7 @@ extern const int ILLEGAL_STATISTICS; static constexpr auto num_hashes = 7uz; static constexpr auto num_buckets = 2718uz; -StatisticsCountMinSketch::StatisticsCountMinSketch(const SingleStatisticsDescription & description, DataTypePtr data_type_) +StatisticsCountMinSketch::StatisticsCountMinSketch(const SingleStatisticsDescription & description, const DataTypePtr & data_type_) : IStatistics(description) , sketch(num_hashes, num_buckets) , data_type(data_type_) @@ -84,15 +84,15 @@ void StatisticsCountMinSketch::deserialize(ReadBuffer & buf) } -void 
countMinSketchStatisticsValidator(const SingleStatisticsDescription & /*description*/, DataTypePtr data_type) +void countMinSketchStatisticsValidator(const SingleStatisticsDescription & /*description*/, const DataTypePtr & data_type) { - data_type = removeNullable(data_type); - data_type = removeLowCardinalityAndNullable(data_type); - if (!data_type->isValueRepresentedByNumber() && !isStringOrFixedString(data_type)) + DataTypePtr inner_data_type = removeNullable(data_type); + inner_data_type = removeLowCardinalityAndNullable(inner_data_type); + if (!inner_data_type->isValueRepresentedByNumber() && !isStringOrFixedString(inner_data_type)) throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'count_min' does not support type {}", data_type->getName()); } -StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type) +StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & data_type) { return std::make_shared(description, data_type); } diff --git a/src/Storages/Statistics/StatisticsCountMinSketch.h b/src/Storages/Statistics/StatisticsCountMinSketch.h index af01408f2a3..d1de1a3aea5 100644 --- a/src/Storages/Statistics/StatisticsCountMinSketch.h +++ b/src/Storages/Statistics/StatisticsCountMinSketch.h @@ -14,7 +14,7 @@ namespace DB class StatisticsCountMinSketch : public IStatistics { public: - StatisticsCountMinSketch(const SingleStatisticsDescription & description, DataTypePtr data_type_); + StatisticsCountMinSketch(const SingleStatisticsDescription & description, const DataTypePtr & data_type_); Float64 estimateEqual(const Field & val) const override; @@ -31,8 +31,8 @@ private: }; -void countMinSketchStatisticsValidator(const SingleStatisticsDescription & description, DataTypePtr data_type); -StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type); +void countMinSketchStatisticsValidator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); +StatisticsPtr countMinSketchStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); } diff --git a/src/Storages/Statistics/StatisticsTDigest.cpp b/src/Storages/Statistics/StatisticsTDigest.cpp index 73ab6c84b4e..1cf92fea24b 100644 --- a/src/Storages/Statistics/StatisticsTDigest.cpp +++ b/src/Storages/Statistics/StatisticsTDigest.cpp @@ -57,15 +57,15 @@ Float64 StatisticsTDigest::estimateEqual(const Field & val) const throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName()); } -void tdigestStatisticsValidator(const SingleStatisticsDescription & /*description*/, DataTypePtr data_type) +void tdigestStatisticsValidator(const SingleStatisticsDescription & /*description*/, const DataTypePtr & data_type) { - data_type = removeNullable(data_type); - data_type = removeLowCardinalityAndNullable(data_type); - if (!data_type->isValueRepresentedByNumber()) + DataTypePtr inner_data_type = removeNullable(data_type); + inner_data_type = removeLowCardinalityAndNullable(inner_data_type); + if (!inner_data_type->isValueRepresentedByNumber()) throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName()); } -StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr /*data_type*/) +StatisticsPtr tdigestStatisticsCreator(const 
SingleStatisticsDescription & description, const DataTypePtr & /*data_type*/) { return std::make_shared(description); } diff --git a/src/Storages/Statistics/StatisticsTDigest.h b/src/Storages/Statistics/StatisticsTDigest.h index 47d6c93f64c..2b37799d07b 100644 --- a/src/Storages/Statistics/StatisticsTDigest.h +++ b/src/Storages/Statistics/StatisticsTDigest.h @@ -23,7 +23,7 @@ private: QuantileTDigest t_digest; }; -void tdigestStatisticsValidator(const SingleStatisticsDescription & description, DataTypePtr data_type); -StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type); +void tdigestStatisticsValidator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); +StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); } diff --git a/src/Storages/Statistics/StatisticsUniq.cpp b/src/Storages/Statistics/StatisticsUniq.cpp index e737f9987a5..07311b5b86d 100644 --- a/src/Storages/Statistics/StatisticsUniq.cpp +++ b/src/Storages/Statistics/StatisticsUniq.cpp @@ -52,15 +52,15 @@ UInt64 StatisticsUniq::estimateCardinality() const return column->getUInt(0); } -void uniqStatisticsValidator(const SingleStatisticsDescription & /*description*/, DataTypePtr data_type) +void uniqStatisticsValidator(const SingleStatisticsDescription & /*description*/, const DataTypePtr & data_type) { - data_type = removeNullable(data_type); - data_type = removeLowCardinalityAndNullable(data_type); - if (!data_type->isValueRepresentedByNumber()) + DataTypePtr inner_data_type = removeNullable(data_type); + inner_data_type = removeLowCardinalityAndNullable(inner_data_type); + if (!inner_data_type->isValueRepresentedByNumber()) throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' do not support type {}", data_type->getName()); } -StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type) +StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & data_type) { return std::make_shared(description, data_type); } diff --git a/src/Storages/Statistics/StatisticsUniq.h b/src/Storages/Statistics/StatisticsUniq.h index 6b511d4f496..1fdcab8bd89 100644 --- a/src/Storages/Statistics/StatisticsUniq.h +++ b/src/Storages/Statistics/StatisticsUniq.h @@ -27,7 +27,7 @@ private: }; -void uniqStatisticsValidator(const SingleStatisticsDescription & description, DataTypePtr data_type); -StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & description, DataTypePtr data_type); +void uniqStatisticsValidator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); +StatisticsPtr uniqStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); } From df2675fad0d1bcb79c8a2d7edd0c08b1da49a945 Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Thu, 18 Jul 2024 12:32:16 +0200 Subject: [PATCH 25/56] [resubmit] add replication lag and recovery time metrics --- src/Databases/DatabaseReplicated.cpp | 45 ++++++++++---- src/Databases/DatabaseReplicated.h | 12 +++- src/Databases/DatabaseReplicatedWorker.cpp | 21 +++++++ src/Databases/DatabaseReplicatedWorker.h | 5 ++ src/Storages/System/StorageSystemClusters.cpp | 49 +++++++++++++-- src/Storages/System/StorageSystemClusters.h | 2 +- .../test_recovery_time_metric/__init__.py | 0 .../configs/config.xml | 41 +++++++++++++ .../test_recovery_time_metric/test.py | 61 
+++++++++++++++++++ .../02117_show_create_table_system.reference | 2 + .../03206_replication_lag_metric.reference | 4 ++ .../03206_replication_lag_metric.sql | 11 ++++ 12 files changed, 234 insertions(+), 19 deletions(-) create mode 100644 tests/integration/test_recovery_time_metric/__init__.py create mode 100644 tests/integration/test_recovery_time_metric/configs/config.xml create mode 100644 tests/integration/test_recovery_time_metric/test.py create mode 100644 tests/queries/0_stateless/03206_replication_lag_metric.reference create mode 100644 tests/queries/0_stateless/03206_replication_lag_metric.sql diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index f127ccbc224..213c94d4d94 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -338,9 +339,12 @@ ClusterPtr DatabaseReplicated::getClusterImpl(bool all_groups) const return std::make_shared(getContext()->getSettingsRef(), shards, params); } -std::vector DatabaseReplicated::tryGetAreReplicasActive(const ClusterPtr & cluster_) const +ReplicasInfo DatabaseReplicated::tryGetReplicasInfo(const ClusterPtr & cluster_) const { Strings paths; + + paths.emplace_back(fs::path(zookeeper_path) / "max_log_ptr"); + const auto & addresses_with_failover = cluster_->getShardsAddresses(); const auto & shards_info = cluster_->getShardsInfo(); for (size_t shard_index = 0; shard_index < shards_info.size(); ++shard_index) @@ -349,31 +353,50 @@ std::vector DatabaseReplicated::tryGetAreReplicasActive(const ClusterPtr { String full_name = getFullReplicaName(replica.database_shard_name, replica.database_replica_name); paths.emplace_back(fs::path(zookeeper_path) / "replicas" / full_name / "active"); + paths.emplace_back(fs::path(zookeeper_path) / "replicas" / full_name / "log_ptr"); } } try { auto current_zookeeper = getZooKeeper(); - auto res = current_zookeeper->exists(paths); + auto zk_res = current_zookeeper->tryGet(paths); - std::vector statuses; - statuses.resize(paths.size()); + auto max_log_ptr_zk = zk_res[0]; + if (max_log_ptr_zk.error != Coordination::Error::ZOK) + throw Coordination::Exception(max_log_ptr_zk.error); - for (size_t i = 0; i < res.size(); ++i) - if (res[i].error == Coordination::Error::ZOK) - statuses[i] = 1; + UInt32 max_log_ptr = parse(max_log_ptr_zk.data); - return statuses; - } - catch (...) + ReplicasInfo replicas_info; + replicas_info.resize((zk_res.size() - 1) / 2); + + size_t global_replica_index = 0; + for (size_t shard_index = 0; shard_index < shards_info.size(); ++shard_index) + { + for (const auto & replica : addresses_with_failover[shard_index]) + { + auto replica_active = zk_res[2 * global_replica_index + 1]; + auto replica_log_ptr = zk_res[2 * global_replica_index + 2]; + + replicas_info[global_replica_index] = ReplicaInfo{ + .is_active = replica_active.error == Coordination::Error::ZOK, + .replication_lag = replica_log_ptr.error != Coordination::Error::ZNONODE ? std::optional(max_log_ptr - parse(replica_log_ptr.data)) : std::nullopt, + .recovery_time = replica.is_local && ddl_worker ? ddl_worker->getCurrentInitializationDurationMs() : 0, + }; + + ++global_replica_index; + } + } + + return replicas_info; + } catch (...) 
{ tryLogCurrentException(log); return {}; } } - void DatabaseReplicated::fillClusterAuthInfo(String collection_name, const Poco::Util::AbstractConfiguration & config_ref) { const auto & config_prefix = fmt::format("named_collections.{}", collection_name); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 27ab262d1f1..db683be8f36 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include @@ -17,6 +19,14 @@ using ZooKeeperPtr = std::shared_ptr; class Cluster; using ClusterPtr = std::shared_ptr; +struct ReplicaInfo +{ + bool is_active; + std::optional replication_lag; + UInt64 recovery_time; +}; +using ReplicasInfo = std::vector; + class DatabaseReplicated : public DatabaseAtomic { public: @@ -84,7 +94,7 @@ public: static void dropReplica(DatabaseReplicated * database, const String & database_zookeeper_path, const String & shard, const String & replica, bool throw_if_noop); - std::vector tryGetAreReplicasActive(const ClusterPtr & cluster_) const; + ReplicasInfo tryGetReplicasInfo(const ClusterPtr & cluster_) const; void renameDatabase(ContextPtr query_context, const String & new_name) override; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 1ef88dc03bc..4e7408aa96e 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -32,6 +32,12 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db bool DatabaseReplicatedDDLWorker::initializeMainThread() { + { + std::lock_guard lock(initialization_duration_timer_mutex); + initialization_duration_timer.emplace(); + initialization_duration_timer->start(); + } + while (!stop_flag) { try @@ -69,6 +75,10 @@ bool DatabaseReplicatedDDLWorker::initializeMainThread() initializeReplication(); initialized = true; + { + std::lock_guard lock(initialization_duration_timer_mutex); + initialization_duration_timer.reset(); + } return true; } catch (...) @@ -78,6 +88,11 @@ bool DatabaseReplicatedDDLWorker::initializeMainThread() } } + { + std::lock_guard lock(initialization_duration_timer_mutex); + initialization_duration_timer.reset(); + } + return false; } @@ -459,4 +474,10 @@ UInt32 DatabaseReplicatedDDLWorker::getLogPointer() const return max_id.load(); } +UInt64 DatabaseReplicatedDDLWorker::getCurrentInitializationDurationMs() const +{ + std::lock_guard lock(initialization_duration_timer_mutex); + return initialization_duration_timer ? 
initialization_duration_timer->elapsedMilliseconds() : 0; +} + } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 41edf2221b8..2309c831839 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -36,6 +36,8 @@ public: DatabaseReplicated * const database, bool committed = false); /// NOLINT UInt32 getLogPointer() const; + + UInt64 getCurrentInitializationDurationMs() const; private: bool initializeMainThread() override; void initializeReplication(); @@ -56,6 +58,9 @@ private: ZooKeeperPtr active_node_holder_zookeeper; /// It will remove "active" node when database is detached zkutil::EphemeralNodeHolderPtr active_node_holder; + + std::optional initialization_duration_timer; + mutable std::mutex initialization_duration_timer_mutex; }; } diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 9c5c07ae49f..db1955c2e99 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -31,6 +32,8 @@ ColumnsDescription StorageSystemClusters::getColumnsDescription() {"database_shard_name", std::make_shared(), "The name of the `Replicated` database shard (for clusters that belong to a `Replicated` database)."}, {"database_replica_name", std::make_shared(), "The name of the `Replicated` database replica (for clusters that belong to a `Replicated` database)."}, {"is_active", std::make_shared(std::make_shared()), "The status of the Replicated database replica (for clusters that belong to a Replicated database): 1 means 'replica is online', 0 means 'replica is offline', NULL means 'unknown'."}, + {"replication_lag", std::make_shared(std::make_shared()), "The replication lag of the `Replicated` database replica (for clusters that belong to a Replicated database)."}, + {"recovery_time", std::make_shared(std::make_shared()), "The recovery time of the `Replicated` database replica (for clusters that belong to a Replicated database), in milliseconds."}, }; description.setAliases({ @@ -67,6 +70,10 @@ void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const std const auto & shards_info = cluster->getShardsInfo(); const auto & addresses_with_failover = cluster->getShardsAddresses(); + ReplicasInfo replicas_info; + if (replicated) + replicas_info = replicated->tryGetReplicasInfo(name_and_cluster.second); + size_t replica_idx = 0; for (size_t shard_index = 0; shard_index < shards_info.size(); ++shard_index) { @@ -114,17 +121,47 @@ void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const std res_columns[res_index++]->insert(address.database_shard_name); if (columns_mask[src_index++]) res_columns[res_index++]->insert(address.database_replica_name); + if (columns_mask[src_index++]) { - std::vector is_active; - if (replicated) - is_active = replicated->tryGetAreReplicasActive(name_and_cluster.second); - - if (is_active.empty()) + if (replicas_info.empty()) res_columns[res_index++]->insertDefault(); else - res_columns[res_index++]->insert(is_active[replica_idx++]); + { + const auto & replica_info = replicas_info[replica_idx]; + res_columns[res_index++]->insert(replica_info.is_active); + } } + + if (columns_mask[src_index++]) + { + if (replicas_info.empty()) + res_columns[res_index++]->insertDefault(); + else + { + const auto & replica_info = replicas_info[replica_idx]; + if (replica_info.replication_lag != std::nullopt) + 
res_columns[res_index++]->insert(*replica_info.replication_lag); + else + res_columns[res_index++]->insertDefault(); + } + } + + if (columns_mask[src_index++]) + { + if (replicas_info.empty()) + res_columns[res_index++]->insertDefault(); + else + { + const auto & replica_info = replicas_info[replica_idx]; + if (replica_info.recovery_time != 0) + res_columns[res_index++]->insert(replica_info.recovery_time); + else + res_columns[res_index++]->insertDefault(); + } + } + + ++replica_idx; } } } diff --git a/src/Storages/System/StorageSystemClusters.h b/src/Storages/System/StorageSystemClusters.h index f6adb902f43..a5f6d551ca1 100644 --- a/src/Storages/System/StorageSystemClusters.h +++ b/src/Storages/System/StorageSystemClusters.h @@ -1,10 +1,10 @@ #pragma once +#include #include #include #include - namespace DB { diff --git a/tests/integration/test_recovery_time_metric/__init__.py b/tests/integration/test_recovery_time_metric/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_recovery_time_metric/configs/config.xml b/tests/integration/test_recovery_time_metric/configs/config.xml new file mode 100644 index 00000000000..bad9b1fa9ea --- /dev/null +++ b/tests/integration/test_recovery_time_metric/configs/config.xml @@ -0,0 +1,41 @@ + + 9000 + + + + + + + + + default + + + + + + 2181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + 20000 + + + + 1 + localhost + 9444 + + + + + + + localhost + 2181 + + 20000 + + + diff --git a/tests/integration/test_recovery_time_metric/test.py b/tests/integration/test_recovery_time_metric/test.py new file mode 100644 index 00000000000..6fcf2fad423 --- /dev/null +++ b/tests/integration/test_recovery_time_metric/test.py @@ -0,0 +1,61 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance( + "node", + main_configs=["configs/config.xml"], + stay_alive=True, +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_recovery_time_metric(start_cluster): + node.query( + """ + DROP DATABASE IF EXISTS rdb; + CREATE DATABASE rdb + ENGINE = Replicated('/test/test_recovery_time_metric', 'shard1', 'replica1') + """ + ) + + node.query( + """ + DROP TABLE IF EXISTS rdb.t; + CREATE TABLE rdb.t + ( + `x` UInt32 + ) + ENGINE = MergeTree + ORDER BY x + """ + ) + + node.exec_in_container(["bash", "-c", "rm /var/lib/clickhouse/metadata/rdb/t.sql"]) + + node.restart_clickhouse() + + ret = int( + node.query( + """ + SELECT recovery_time + FROM system.clusters + WHERE cluster = 'rdb' + """ + ).strip() + ) + assert ret > 0 + + node.query( + """ + DROP DATABASE rdb + """ + ) diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index cfae4fee6c2..32e8b2f4312 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -52,6 +52,8 @@ CREATE TABLE system.clusters `database_shard_name` String, `database_replica_name` String, `is_active` Nullable(UInt8), + `replication_lag` Nullable(UInt32), + `recovery_time` Nullable(UInt64), `name` String ALIAS cluster ) ENGINE = SystemClusters diff --git a/tests/queries/0_stateless/03206_replication_lag_metric.reference b/tests/queries/0_stateless/03206_replication_lag_metric.reference new file mode 100644 index 
00000000000..02f4a7264b1 --- /dev/null +++ b/tests/queries/0_stateless/03206_replication_lag_metric.reference @@ -0,0 +1,4 @@ +0 +2 +0 +2 diff --git a/tests/queries/0_stateless/03206_replication_lag_metric.sql b/tests/queries/0_stateless/03206_replication_lag_metric.sql new file mode 100644 index 00000000000..998c332a11c --- /dev/null +++ b/tests/queries/0_stateless/03206_replication_lag_metric.sql @@ -0,0 +1,11 @@ +-- Tags: no-parallel + +CREATE DATABASE rdb1 ENGINE = Replicated('/test/test_replication_lag_metric', 'shard1', 'replica1'); +CREATE DATABASE rdb2 ENGINE = Replicated('/test/test_replication_lag_metric', 'shard1', 'replica2'); + +SET distributed_ddl_task_timeout = 0; +CREATE TABLE rdb1.t (id UInt32) ENGINE = ReplicatedMergeTree ORDER BY id; +SELECT replication_lag FROM system.clusters WHERE cluster IN ('rdb1', 'rdb2') ORDER BY cluster ASC, replica_num ASC; + +DROP DATABASE rdb1; +DROP DATABASE rdb2; From 54ba7703b1ccd116dceefe8b0e9c6aca5c24d212 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Aug 2024 15:16:43 +0000 Subject: [PATCH 26/56] Fix #67742 --- .../ConditionSelectivityEstimator.cpp | 38 ++++++---------- .../ConditionSelectivityEstimator.h | 10 ++--- src/Storages/Statistics/Statistics.cpp | 40 ++--------------- src/Storages/Statistics/Statistics.h | 7 --- src/Storages/Statistics/StatisticsTDigest.cpp | 43 ++++++++++--------- src/Storages/Statistics/StatisticsTDigest.h | 3 +- .../02864_statistics_bugs.reference | 1 + .../0_stateless/02864_statistics_bugs.sql | 9 ++++ 8 files changed, 54 insertions(+), 97 deletions(-) create mode 100644 tests/queries/0_stateless/02864_statistics_bugs.reference create mode 100644 tests/queries/0_stateless/02864_statistics_bugs.sql diff --git a/src/Storages/Statistics/ConditionSelectivityEstimator.cpp b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp index 57dff958b9a..432659f51f8 100644 --- a/src/Storages/Statistics/ConditionSelectivityEstimator.cpp +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp @@ -19,7 +19,7 @@ void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String par Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(const Field & val, Float64 rows) const { if (part_statistics.empty()) - return default_normal_cond_factor * rows; + return default_cond_range_factor * rows; Float64 result = 0; Float64 part_rows = 0; for (const auto & [key, estimator] : part_statistics) @@ -39,13 +39,7 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual { if (part_statistics.empty()) { - auto float_val = StatisticsUtils::tryConvertToFloat64(val); - if (!float_val) - return default_unknown_cond_factor * rows; - else if (float_val.value() < - threshold || float_val.value() > threshold) - return default_normal_cond_factor * rows; - else - return default_good_cond_factor * rows; + return default_cond_equal_factor * rows; } Float64 result = 0; Float64 partial_cnt = 0; @@ -149,30 +143,22 @@ Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode auto [op, val] = extractBinaryOp(node, col); + if (dummy) + { + if (op == "equals") + return default_cond_equal_factor * total_rows; + else if (op == "less" || op == "lessOrEquals" || op == "greater" || op == "greaterOrEquals") + return default_cond_range_factor * total_rows; + else + return default_unknown_cond_factor * total_rows; + } + if (op == "equals") - { - if (dummy) - { - auto float_val = StatisticsUtils::tryConvertToFloat64(val); - if (!float_val || (float_val < - 
threshold || float_val > threshold)) - return default_normal_cond_factor * total_rows; - else - return default_good_cond_factor * total_rows; - } return estimator.estimateEqual(val, total_rows); - } else if (op == "less" || op == "lessOrEquals") - { - if (dummy) - return default_normal_cond_factor * total_rows; return estimator.estimateLess(val, total_rows); - } else if (op == "greater" || op == "greaterOrEquals") - { - if (dummy) - return default_normal_cond_factor * total_rows; return estimator.estimateGreater(val, total_rows); - } else return default_unknown_cond_factor * total_rows; } diff --git a/src/Storages/Statistics/ConditionSelectivityEstimator.h b/src/Storages/Statistics/ConditionSelectivityEstimator.h index ce7fdd12e92..269ee9ac6cb 100644 --- a/src/Storages/Statistics/ConditionSelectivityEstimator.h +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.h @@ -38,12 +38,10 @@ private: std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const; - static constexpr auto default_good_cond_factor = 0.1; - static constexpr auto default_normal_cond_factor = 0.5; - static constexpr auto default_unknown_cond_factor = 1.0; - /// Conditions like "x = N" are considered good if abs(N) > threshold. - /// This is used to assume that condition is likely to have good selectivity. - static constexpr auto threshold = 2; + /// Used to estimate the selectivity of a condition when there is no statistics. + static constexpr auto default_cond_range_factor = 0.5; + static constexpr auto default_cond_equal_factor = 0.01; + static constexpr auto default_unknown_cond_factor = 1; UInt64 total_rows = 0; std::map column_estimators; diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index e3f9fcc8192..52eec437ac2 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -27,36 +27,6 @@ enum StatisticsFileVersion : UInt16 V0 = 0, }; -std::optional StatisticsUtils::tryConvertToFloat64(const Field & field) -{ - switch (field.getType()) - { - case Field::Types::Int64: - return field.get(); - case Field::Types::UInt64: - return field.get(); - case Field::Types::Float64: - return field.get(); - case Field::Types::Int128: - return field.get(); - case Field::Types::UInt128: - return field.get(); - case Field::Types::Int256: - return field.get(); - case Field::Types::UInt256: - return field.get(); - default: - return {}; - } -} - -std::optional StatisticsUtils::tryConvertToString(const DB::Field & field) -{ - if (field.getType() == Field::Types::String) - return field.get(); - return {}; -} - IStatistics::IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) { @@ -105,7 +75,7 @@ Float64 ColumnStatistics::estimateLess(const Field & val) const { if (stats.contains(StatisticsType::TDigest)) return stats.at(StatisticsType::TDigest)->estimateLess(val); - return rows * ConditionSelectivityEstimator::default_normal_cond_factor; + return rows * ConditionSelectivityEstimator::default_cond_range_factor; } Float64 ColumnStatistics::estimateGreater(const Field & val) const @@ -115,8 +85,7 @@ Float64 ColumnStatistics::estimateGreater(const Field & val) const Float64 ColumnStatistics::estimateEqual(const Field & val) const { - auto float_val = StatisticsUtils::tryConvertToFloat64(val); - if (float_val.has_value() && stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest)) + if (stats_desc.data_type->isValueRepresentedByNumber() && stats.contains(StatisticsType::Uniq) && 
stats.contains(StatisticsType::TDigest)) { /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) for every bucket. if (stats.at(StatisticsType::Uniq)->estimateCardinality() < 2048) @@ -126,10 +95,7 @@ Float64 ColumnStatistics::estimateEqual(const Field & val) const if (stats.contains(StatisticsType::CountMinSketch)) return stats.at(StatisticsType::CountMinSketch)->estimateEqual(val); #endif - if (!float_val.has_value() && (float_val < - ConditionSelectivityEstimator::threshold || float_val > ConditionSelectivityEstimator::threshold)) - return rows * ConditionSelectivityEstimator::default_normal_cond_factor; - else - return rows * ConditionSelectivityEstimator::default_good_cond_factor; + return rows * ConditionSelectivityEstimator::default_cond_equal_factor; } /// ------------------------------------- diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index c6a45e68aa6..593ac20edb5 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -15,13 +15,6 @@ constexpr auto STATS_FILE_PREFIX = "statistics_"; constexpr auto STATS_FILE_SUFFIX = ".stats"; -struct StatisticsUtils -{ - /// Returns std::nullopt if input Field cannot be converted to a concrete value - static std::optional tryConvertToFloat64(const Field & field); - static std::optional tryConvertToString(const Field & field); -}; - /// Statistics describe properties of the values in the column, /// e.g. how many unique values exist, /// what are the N most frequent values, diff --git a/src/Storages/Statistics/StatisticsTDigest.cpp b/src/Storages/Statistics/StatisticsTDigest.cpp index 1cf92fea24b..b0c4bfda27d 100644 --- a/src/Storages/Statistics/StatisticsTDigest.cpp +++ b/src/Storages/Statistics/StatisticsTDigest.cpp @@ -1,6 +1,8 @@ #include -#include +#include #include +#include +#include namespace DB { @@ -10,24 +12,21 @@ extern const int ILLEGAL_STATISTICS; extern const int LOGICAL_ERROR; } -StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & description) +StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & description, const DataTypePtr & data_type_) : IStatistics(description) + , data_type(data_type_) { } void StatisticsTDigest::update(const ColumnPtr & column) { - size_t rows = column->size(); - for (size_t row = 0; row < rows; ++row) + for (size_t row = 0; row < column->size(); ++row) { - Field field; - column->get(row, field); - - if (field.isNull()) + if (column->isNullAt(row)) continue; - if (auto field_as_float = StatisticsUtils::tryConvertToFloat64(field)) - t_digest.add(*field_as_float, 1); + auto data = column->getFloat64(row); + t_digest.add(data, 1); } } @@ -43,18 +42,22 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf) Float64 StatisticsTDigest::estimateLess(const Field & val) const { - auto val_as_float = StatisticsUtils::tryConvertToFloat64(val); - if (val_as_float) - return t_digest.getCountLessThan(*val_as_float); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName()); + Field val_converted = convertFieldToType(val, *data_type); + if (val_converted.isNull()) + return 0; + + auto val_as_float = applyVisitor(FieldVisitorConvertToNumber(), val_converted); + return t_digest.getCountLessThan(val_as_float); } Float64 StatisticsTDigest::estimateEqual(const Field & val) const { - auto val_as_float = StatisticsUtils::tryConvertToFloat64(val); - if 
(val_as_float) - return t_digest.getCountEqual(*val_as_float); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Statistics 'tdigest' does not support estimating value of type {}", val.getTypeName()); + Field val_converted = convertFieldToType(val, *data_type); + if (val_converted.isNull()) + return 0; + + auto val_as_float = applyVisitor(FieldVisitorConvertToNumber(), val_converted); + return t_digest.getCountEqual(val_as_float); } void tdigestStatisticsValidator(const SingleStatisticsDescription & /*description*/, const DataTypePtr & data_type) @@ -65,9 +68,9 @@ void tdigestStatisticsValidator(const SingleStatisticsDescription & /*descriptio throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' do not support type {}", data_type->getName()); } -StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & /*data_type*/) +StatisticsPtr tdigestStatisticsCreator(const SingleStatisticsDescription & description, const DataTypePtr & data_type) { - return std::make_shared(description); + return std::make_shared(description, data_type); } } diff --git a/src/Storages/Statistics/StatisticsTDigest.h b/src/Storages/Statistics/StatisticsTDigest.h index 2b37799d07b..5e744fee2ce 100644 --- a/src/Storages/Statistics/StatisticsTDigest.h +++ b/src/Storages/Statistics/StatisticsTDigest.h @@ -9,7 +9,7 @@ namespace DB class StatisticsTDigest : public IStatistics { public: - explicit StatisticsTDigest(const SingleStatisticsDescription & description); + explicit StatisticsTDigest(const SingleStatisticsDescription & description, const DataTypePtr & data_type_); void update(const ColumnPtr & column) override; @@ -21,6 +21,7 @@ public: private: QuantileTDigest t_digest; + DataTypePtr data_type; }; void tdigestStatisticsValidator(const SingleStatisticsDescription & description, const DataTypePtr & data_type); diff --git a/tests/queries/0_stateless/02864_statistics_bugs.reference b/tests/queries/0_stateless/02864_statistics_bugs.reference new file mode 100644 index 00000000000..f599e28b8ab --- /dev/null +++ b/tests/queries/0_stateless/02864_statistics_bugs.reference @@ -0,0 +1 @@ +10 diff --git a/tests/queries/0_stateless/02864_statistics_bugs.sql b/tests/queries/0_stateless/02864_statistics_bugs.sql new file mode 100644 index 00000000000..ef1735550e6 --- /dev/null +++ b/tests/queries/0_stateless/02864_statistics_bugs.sql @@ -0,0 +1,9 @@ +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; +SET mutations_sync = 1; + +DROP TABLE IF EXISTS bug_67742; +CREATE TABLE bug_67742 (a Float64 STATISTICS(tdigest)) Engine = MergeTree() ORDER BY tuple(); +INSERT INTO bug_67742 SELECT number FROM system.numbers LIMIT 10000; +SELECT count(*) FROM bug_67742 WHERE a < '10'; +DROP TABLE bug_67742; From bf33aabec412aa2729bfd58f3e717c5b8285acb8 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 17 Jun 2024 10:39:10 +0200 Subject: [PATCH 27/56] Add documentation. 
(cherry picked from commit 083fff6ed6ccff44b678ae3ea6af75501d9359fb) --- docs/en/engines/table-engines/index.md | 1 + .../table-engines/integrations/time-series.md | 295 ++++++++++++++++++ docs/en/interfaces/prometheus.md | 160 ++++++++++ .../settings.md | 42 --- docs/en/operations/settings/settings.md | 11 + .../table-functions/timeSeriesData.md | 28 ++ .../table-functions/timeSeriesMetrics.md | 28 ++ .../table-functions/timeSeriesTags.md | 28 ++ .../aspell-ignore/en/aspell-dict.txt | 6 + 9 files changed, 557 insertions(+), 42 deletions(-) create mode 100644 docs/en/engines/table-engines/integrations/time-series.md create mode 100644 docs/en/interfaces/prometheus.md create mode 100644 docs/en/sql-reference/table-functions/timeSeriesData.md create mode 100644 docs/en/sql-reference/table-functions/timeSeriesMetrics.md create mode 100644 docs/en/sql-reference/table-functions/timeSeriesTags.md diff --git a/docs/en/engines/table-engines/index.md b/docs/en/engines/table-engines/index.md index 5e81eacc937..20c7c511aa9 100644 --- a/docs/en/engines/table-engines/index.md +++ b/docs/en/engines/table-engines/index.md @@ -61,6 +61,7 @@ Engines in the family: - [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md) - [PostgreSQL](../../engines/table-engines/integrations/postgresql.md) - [S3Queue](../../engines/table-engines/integrations/s3queue.md) +- [TimeSeries](../../engines/table-engines/integrations/time-series.md) ### Special Engines {#special-engines} diff --git a/docs/en/engines/table-engines/integrations/time-series.md b/docs/en/engines/table-engines/integrations/time-series.md new file mode 100644 index 00000000000..4830fd61d27 --- /dev/null +++ b/docs/en/engines/table-engines/integrations/time-series.md @@ -0,0 +1,295 @@ +--- +slug: /en/engines/table-engines/special/time_series +sidebar_position: 60 +sidebar_label: TimeSeries +--- + +# TimeSeries Engine [Experimental] + +A table engine storing time series, i.e. a set of values associated with timestamps and tags (or labels): + +``` +metric_name1[tag1=value1, tag2=value2, ...] = {timestamp1: value1, timestamp2: value2, ...} +metric_name2[...] = ... +``` + +:::info +This is an experimental feature that may change in backwards-incompatible ways in the future releases. +Enable usage of the TimeSeries table engine +with [allow_experimental_time_series_table](../../../operations/settings/settings.md#allow-experimental-time-series-table) setting. +Input the command `set allow_experimental_time_series_table = 1`. +::: + +## Syntax {#syntax} + +``` sql +CREATE TABLE name [(columns)] ENGINE=TimeSeries +[SETTINGS var1=value1, ...] +[DATA db.data_table_name | DATA ENGINE data_table_engine(arguments)] +[TAGS db.tags_table_name | TAGS ENGINE tags_table_engine(arguments)] +[METRICS db.metrics_table_name | METRICS ENGINE metrics_table_engine(arguments)] +``` + +## Usage {#usage} + +It's easier to start with everything set by default (it's allowed to create a `TimeSeries` table without specifying a list of columns): + +``` sql +CREATE TABLE my_table ENGINE=TimeSeries +``` + +Then this table can be used with the following protocols (a port must be assigned in the server configuration): +- [prometheus remote-write](../../../interfaces/prometheus.md#remote-write) +- [prometheus remote-read](../../../interfaces/prometheus.md#remote-read) + +## Target tables {#target-tables} + +A `TimeSeries` table doesn't have its own data, everything is stored in its target tables. 
+This is similar to how a [materialized view](../../../sql-reference/statements/create/view#materialized-view) works, +with the difference that a materialized view has one target table +whereas a `TimeSeries` table has three target tables named [data]{#data-table}, [tags]{#tags-table], and [metrics]{#metrics-table}. + +The target tables can be either specified explicitly in the `CREATE TABLE` query +or the `TimeSeries` table engine can generate inner target tables automatically. + +The target tables are the following: +1. The _data_ table {#data-table} contains time series associated with some identifier. +The _data_ table must have columns: + +| Name | Mandatory? | Default type | Possible types | Description | +|---|---|---|---|---| +| `id` | [x] | `UUID` | any | Identifies a combination of a metric names and tags | +| `timestamp` | [x] | `DateTime64(3)` | `DateTime64(X)` | A time point | +| `value` | [x] | `Float64` | `Float32` or `Float64` | A value associated with the `timestamp` | + +2. The _tags_ table {#tags-table} contains identifiers calculated for each combination of a metric name and tags. +The _tags_ table must have columns: + +| Name | Mandatory? | Default type | Possible types | Description | +|---|---|---|---|---| +| `id` | [x] | `UUID` | any (must match the type of `id` in the [data]{#data-table} table) | An `id` identifies a combination of a metric name and tags. The DEFAULT expression specifies how to calculate such an identifier | +| `metric_name` | [x] | `LowCardinality(String)` | `String` or `LowCardinality(String)` | The name of a metric | +| `` | [ ] | `String` | `String` or `LowCardinality(String)` or `LowCardinality(Nullable(String))` | The value of a specific tag, the tag's name and the name of a corresponding column are specified in the [tags_to_columns](#settings) setting | +| `tags` | [x] | `Map(LowCardinality(String), String)` | `Map(String, String)` or `Map(LowCardinality(String), String)` or `Map(LowCardinality(String), LowCardinality(String))` | Map of tags excluding the tag `__name__` containing the name of a metric and excluding tags with names enumerated in the [tags_to_columns](#settings) setting | +| `all_tags` | [ ] | `Map(String, String)` | `Map(String, String)` or `Map(LowCardinality(String), String)` or `Map(LowCardinality(String), LowCardinality(String))` | Ephemeral column, each row is a map of all the tags excluding only the tag `__name__` containing the name of a metric. The only purpose of that column is to be used while calculating `id` | +| `min_time` | [ ] | `Nullable(DateTime64(3))` | `DateTime64(X)` or `Nullable(DateTime64(X))` | Minimum timestamp of time series with that `id`. The column is created if [store_min_time_and_max_time](#settings) is `true` | +| `max_time` | [ ] | `Nullable(DateTime64(3))` | `DateTime64(X)` or `Nullable(DateTime64(X))` | Maximum timestamp of time series with that `id`. The column is created if [store_min_time_and_max_time](#settings) is `true` | + +3. The _metrics_ table {#metrics-table} contains some information about metrics been collected, the types of those metrics and their descriptions. +The _metrics_ table must have columns: + +| Name | Mandatory? 
| Default type | Possible types | Description | +|---|---|---|---|---| +| `metric_family_name` | [x] | `String` | `String` or `LowCardinality(String)` | The name of a metric family | +| `type` | [x] | `String` | `String` or `LowCardinality(String)` | The type of a metric family, one of "counter", "gauge", "summary", "stateset", "histogram", "gaugehistogram" | +| `unit` | [x] | `String` | `String` or `LowCardinality(String)` | The unit used in a metric | +| `help` | [x] | `String` | `String` or `LowCardinality(String)` | The description of a metric | + +Any row inserted into a `TimeSeries` table will be in fact stored in those three target tables. +A `TimeSeries` table contains all those columns from the [data]{#data-table}, [tags]{#tags-table}, [metrics]{#metrics-table} tables. + +## Creation {#creation} + +There are multiple ways to create a table with the `TimeSeries` table engine. +The simplest statement + +``` sql +CREATE TABLE my_table ENGINE=TimeSeries +``` + +will actually create the following table (you can see that by executing `SHOW CREATE TABLE my_table`): + +``` sql +CREATE TABLE my_table +( + `id` UUID DEFAULT reinterpretAsUUID(sipHash128(metric_name, all_tags)), + `timestamp` DateTime64(3), + `value` Float64, + `metric_name` LowCardinality(String), + `tags` Map(LowCardinality(String), String), + `all_tags` Map(String, String), + `min_time` Nullable(DateTime64(3)), + `max_time` Nullable(DateTime64(3)), + `metric_family_name` String, + `type` String, + `unit` String, + `help` String +) +ENGINE = TimeSeries +DATA ENGINE = MergeTree ORDER BY (id, timestamp) +DATA INNER UUID '01234567-89ab-cdef-0123-456789abcdef' +TAGS ENGINE = AggregatingMergeTree PRIMARY KEY metric_name ORDER BY (metric_name, id) +TAGS INNER UUID '01234567-89ab-cdef-0123-456789abcdef' +METRICS ENGINE = ReplacingMergeTree ORDER BY metric_family_name +METRICS INNER UUID '01234567-89ab-cdef-0123-456789abcdef' +``` + +So the columns were generated automatically and also there are three inner UUIDs in this statement - +one per each inner target table that was created. +(Inner UUIDs are not shown normally until setting +[show_table_uuid_in_table_create_query_if_not_nil](../../../operations/settings/settings#show_table_uuid_in_table_create_query_if_not_nil) +is set.) 
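For example, one way to make those inner UUIDs visible is to enable the setting mentioned above before re-running `SHOW CREATE TABLE`. This is a minimal sketch, assuming the `my_table` definition from the example above:

``` sql
SET show_table_uuid_in_table_create_query_if_not_nil = 1;
SHOW CREATE TABLE my_table;  -- the three inner UUIDs now appear in the generated statement
```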
+ +Inner target tables have names like `.inner_id.data.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, +`.inner_id.tags.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, `.inner_id.metrics.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` +and each target table has columns which is a subset of the columns of the main `TimeSeries` table: + +``` sql +CREATE TABLE default.`.inner_id.data.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` +( + `id` UUID, + `timestamp` DateTime64(3), + `value` Float64 +) +ENGINE = MergeTree +ORDER BY (id, timestamp) +``` + +``` sql +CREATE TABLE default.`.inner_id.tags.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` +( + `id` UUID DEFAULT reinterpretAsUUID(sipHash128(metric_name, all_tags)), + `metric_name` LowCardinality(String), + `tags` Map(LowCardinality(String), String), + `all_tags` Map(String, String) EPHEMERAL, + `min_time` SimpleAggregateFunction(min, Nullable(DateTime64(3))), + `max_time` SimpleAggregateFunction(max, Nullable(DateTime64(3))) +) +ENGINE = AggregatingMergeTree +PRIMARY KEY metric_name +ORDER BY (metric_name, id) +``` + +``` sql +CREATE TABLE default.`.inner_id.metrics.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` +( + `metric_family_name` String, + `type` String, + `unit` String, + `help` String +) +ENGINE = ReplacingMergeTree +ORDER BY metric_family_name +``` + +## Adjusting types of columns {#adjusting-column-types} + +You can adjust the types of almost any column of the inner target tables by specifying them explicitly +while defining the main table. For example, + +``` sql +CREATE TABLE my_table +( + timestamp DateTime64(6) +) ENGINE=TimeSeries +``` + +will make the inner [data]{#data-table} table store timestamp in microseconds instead of milliseconds: + +``` sql +CREATE TABLE default.`.inner_id.data.xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` +( + `id` UUID, + `timestamp` DateTime64(6), + `value` Float64 +) +ENGINE = MergeTree +ORDER BY (id, timestamp) +``` + +## The `id` column {#id-column} + +The `id` column contains identifiers, every identifier is calculated for a combination of a metric name and tags. +The DEFAULT expression for the `id` column is an expression which will be used to calculate such identifiers. +Both the type of the `id` column and that expression can be adjusted by specifying them explicitly: + +``` sql +CREATE TABLE my_table +( + id UInt64 DEFAULT sipHash64(metric_name, all_tags) +) ENGINE=TimeSeries +``` + +## The `tags` and `all_tags` columns {#tags-and-all-tags} + +There are two columns containing maps of tags - `tags` and `all_tags`. In this example they mean the same, however they can be different +if setting `tags_to_columns` is used. This setting allows to specify that a specific tag should be stored in a separate column instead of storing +in a map inside the `tags` column: + +``` sql +CREATE TABLE my_table ENGINE=TimeSeries SETTINGS = {'instance': 'instance', 'job': 'job'} +``` + +This statement will add columns +``` + `instance` String, + `job` String +``` +to the definition of both `my_table` and its inner [tags]{#tags-table} target table. In this case the `tags` column will not contain tags `instance` and `job`, +but the `all_tags` column will contain them. The `all_tags` column is ephemeral and its only purpose to be used in the DEFAULT expression +for the `id` column. 
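Once defined this way, the extracted tags can be filtered like ordinary columns of the inner tags table, for instance via the `timeSeriesTags` table function described later in this document. A sketch, assuming the `my_table` definition above and a hypothetical `job` value:

``` sql
SELECT metric_name, instance, job
FROM timeSeriesTags(my_table)
WHERE job = 'node_exporter';  -- 'node_exporter' is a hypothetical value; instance and job are the dedicated columns added by the setting above
```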
+ +The types of columns can be adjusted by specifying them explicitly: + +``` sql +CREATE TABLE my_table (instance LowCardinality(String), job LowCardinality(Nullable(String))) +ENGINE=TimeSeries SETTINGS = {'instance': 'instance', 'job': 'job'} +``` + +## Table engines of inner target tables {#inner-table-engines} + +By default inner target tables use the following table engines: +- the [data]{#data-table} table uses [MergeTree](../mergetree-family/mergetree); +- the [tags]{#tags-table} table uses [AggregatingMergeTree](../mergetree-family/aggregatingmergetree) because the same data is often inserted multiple times to this table so we need a way +to remove duplicates, and also because it's required to do aggregation for columns `min_time` and `max_time`; +- the [metrics]{#metrics-table} table uses [ReplacingMergeTree](../mergetree-family/replacingmergetree) because the same data is often inserted multiple times to this table so we need a way +to remove duplicates. + +Other table engines also can be used for inner target tables if it's specified so: + +``` sql +CREATE TABLE my_table ENGINE=TimeSeries +DATA ENGINE=ReplicatedMergeTree +TAGS ENGINE=ReplicatedAggregatingMergeTree +METRICS ENGINE=ReplicatedReplacingMergeTree +``` + +## External target tables {#external-target-tables} + +It's possible to make a `TimeSeries` table use a manually created table: + +``` sql +CREATE TABLE data_for_my_table +( + `id` UUID, + `timestamp` DateTime64(3), + `value` Float64 +) +ENGINE = MergeTree +ORDER BY (id, timestamp); + +CREATE TABLE tags_for_my_table ... + +CREATE TABLE metrics_for_my_table ... + +CREATE TABLE my_table ENGINE=TimeSeries DATA data_for_my_table TAGS tags_for_my_table METRICS metrics_for_my_table; +``` + +## Settings {#settings} + +Here is a list of settings which can be specified while defining a `TimeSeries` table: + +| Name | Type | Default | Description | +|---|---|---|---| +| `tags_to_columns` | Map | {} | Map specifying which tags should be put to separate columns in the [tags]{#tags-table} table. 
Syntax: `{'tag1': 'column1', 'tag2' : column2, ...}` | +| `use_all_tags_column_to_generate_id` | Bool | true | When generating an expression to calculate an identifier of a time series, this flag enables using the `all_tags` column in that calculation | +| `store_min_time_and_max_time` | Bool | true | If set to true then the table will store `min_time` and `max_time` for each time series | +| `aggregate_min_time_and_max_time` | Bool | true | When creating an inner target `tags` table, this flag enables using `SimpleAggregateFunction(min, Nullable(DateTime64(3)))` instead of just `Nullable(DateTime64(3))` as the type of the `min_time` column, and the same for the `max_time` column | +| `filter_by_min_time_and_max_time` | Bool | true | If set to true then the table will use the `min_time` and `max_time` columns for filtering time series | + +# Functions {#functions} + +Here is a list of functions supporting a `TimeSeries` table as an argument: +- [timeSeriesData](../../../sql-reference/table-functions/timeSeriesData.md) +- [timeSeriesTags](../../../sql-reference/table-functions/timeSeriesTags.md) +- [timeSeriesMetrics](../../../sql-reference/table-functions/timeSeriesMetrics.md) diff --git a/docs/en/interfaces/prometheus.md b/docs/en/interfaces/prometheus.md new file mode 100644 index 00000000000..75a68c59219 --- /dev/null +++ b/docs/en/interfaces/prometheus.md @@ -0,0 +1,160 @@ +--- +slug: /en/interfaces/prometheus +sidebar_position: 19 +sidebar_label: Prometheus protocols +--- + +# Prometheus protocols + +## Exposing metrics {#expose} + +:::note +ClickHouse Cloud does not currently support connecting to Prometheus. To be notified when this feature is supported, please contact support@clickhouse.com. +::: + +ClickHouse can expose its own metrics for scraping from Prometheus: + +```xml + + 9363 + /metrics + true + true + true + true + + +Section `` can be used to make more extended handlers. +This section is similar to [](http.md) but works for prometheus protocols: + +```xml + + 9363 + + + /metrics + + expose_metrics + true + true + true + true + + + + +``` + +Settings: + +| Name | Default | Description | +|---|---|---|---| +| `port` | none | Port for serving the exposing metrics protocol. | +| `endpoint` | `/metrics` | HTTP endpoint for scraping metrics by prometheus server. Starts with `/`. Should not be used with the `` section. | +| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](http.md) section. | +| `metrics` | true | Expose metrics from the [system.metrics](../operations/system-tables/metrics.md) table. | +| `asynchronous_metrics` | true | Expose current metrics values from the [system.asynchronous_metrics](../operations/system-tables/asynchronous_metrics.md) table. | +| `events` | true | Expose metrics from the [system.events](../operations/system-tables/events.md) table. | +| `errors` | true | Expose the number of errors by error codes occurred since the last server restart. This information could be obtained from the [system.errors](../operations/system-tables/errors.md) as well. | + +Check (replace `127.0.0.1` with the IP addr or hostname of your ClickHouse server): +```bash +curl 127.0.0.1:9363/metrics +``` + +## Remote-write protocol {#remote-write} + +ClickHouse supports the [remote-write](https://prometheus.io/docs/specs/remote_write_spec/) protocol. 
+Data are received by this protocol and written to a [TimeSeries](../engines/table-engines/integrations/time-series.md) table
+(which should be created beforehand).
+
+```xml
+<prometheus>
+    <port>9363</port>
+    <handlers>
+        <my_rule_1>
+            <url>/write</url>
+            <handler>
+                <type>remote_write</type>
+                <database>db_name</database>
+                <table>time_series_table</table>
+            </handler>
+        </my_rule_1>
+    </handlers>
+</prometheus>
+``` + +Settings: + +| Name | Default | Description | +|---|---|---|---| +| `port` | none | Port for serving the `remote-write` protocol. | +| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](http.md) section. | +| `table` | none | The name of a [TimeSeries](../engines/table-engines/integrations/time-series.md) table to write data received by the `remote-write` protocol. This name can optionally contain the name of a database too. | +| `database` | none | The name of a database where the table specified in the `table` setting is located if it's not specified in the `table` setting. | + +## Remote-read protocol {#remote-read} + +ClickHouse supports the [remote-read](https://prometheus.io/docs/prometheus/latest/querying/remote_read_api/) protocol. +Data are read from a [TimeSeries](../engines/table-engines/integrations/time-series.md) table and sent via this protocol. + +```xml + + 9363 + + + /read + + remote_readdb_name + time_series_table
+
+
+
+
+``` + +Settings: + +| Name | Default | Description | +|---|---|---|---| +| `port` | none | Port for serving the `remote-read` protocol. | +| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](http.md) section. | +| `table` | none | The name of a [TimeSeries](../engines/table-engines/integrations/time-series.md) table to read data to send by the `remote-read` protocol. This name can optionally contain the name of a database too. | +| `database` | none | The name of a database where the table specified in the `table` setting is located if it's not specified in the `table` setting. | + +## Configuration for multiple protocols {#multiple-protocols} + +Multiple protocols can be specified together in one place: + +```xml + + 9363 + + + /metrics + + expose_metrics + true + true + true + true + + + + /write + + remote_writedb_name.time_series_table + + + + /read + + remote_readdb_name.time_series_table + + + + +``` diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index a1e3c292b04..68f61650e00 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -2112,48 +2112,6 @@ The trailing slash is mandatory. /var/lib/clickhouse/ ``` -## Prometheus {#prometheus} - -:::note -ClickHouse Cloud does not currently support connecting to Prometheus. To be notified when this feature is supported, please contact support@clickhouse.com. -::: - -Exposing metrics data for scraping from [Prometheus](https://prometheus.io). - -Settings: - -- `endpoint` – HTTP endpoint for scraping metrics by prometheus server. Start from ‘/’. -- `port` – Port for `endpoint`. -- `metrics` – Expose metrics from the [system.metrics](../../operations/system-tables/metrics.md#system_tables-metrics) table. -- `events` – Expose metrics from the [system.events](../../operations/system-tables/events.md#system_tables-events) table. -- `asynchronous_metrics` – Expose current metrics values from the [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md#system_tables-asynchronous_metrics) table. -- `errors` - Expose the number of errors by error codes occurred since the last server restart. This information could be obtained from the [system.errors](../../operations/system-tables/asynchronous_metrics.md#system_tables-errors) as well. - -**Example** - -``` xml - - 0.0.0.0 - 8123 - 9000 - - - /metrics - 9363 - true - true - true - true - - - -``` - -Check (replace `127.0.0.1` with the IP addr or hostname of your ClickHouse server): -```bash -curl 127.0.0.1:9363/metrics -``` - ## query_log {#query-log} Setting for logging queries received with the [log_queries=1](../../operations/settings/settings.md) setting. diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 35547c3a9a6..feac12f9c99 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5614,3 +5614,14 @@ Default value: `1GiB`. Disable all insert and mutations (alter table update / alter table delete / alter table drop partition). Set to true, can make this node focus on reading queries. Default value: `false`. 
+ +## allow_experimental_time_series_table {#allow-experimental-time-series-table} + +Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine. + +Possible values: + +- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. +- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. + +Default value: `0`. diff --git a/docs/en/sql-reference/table-functions/timeSeriesData.md b/docs/en/sql-reference/table-functions/timeSeriesData.md new file mode 100644 index 00000000000..aa7a9d30c2a --- /dev/null +++ b/docs/en/sql-reference/table-functions/timeSeriesData.md @@ -0,0 +1,28 @@ +--- +slug: /en/sql-reference/table-functions/timeSeriesData +sidebar_position: 145 +sidebar_label: timeSeriesData +--- + +# timeSeriesData + +`timeSeriesData(db_name.time_series_table)` - Returns the [data](../../engines/table-engines/integrations/time-series.md#data-table) table +used by table `db_name.time_series_table` which table engine is [TimeSeries](../../engines/table-engines/integrations/time-series.md): + +``` sql +CREATE TABLE db_name.time_series_table ENGINE=TimeSeries DATA data_table +``` + +The function also works if the _data_ table is inner: + +``` sql +CREATE TABLE db_name.time_series_table ENGINE=TimeSeries DATA INNER UUID '01234567-89ab-cdef-0123-456789abcdef' +``` + +The following queries are equivalent: + +``` sql +SELECT * FROM timeSeriesData(db_name.time_series_table); +SELECT * FROM timeSeriesData('db_name.time_series_table'); +SELECT * FROM timeSeriesData('db_name', 'time_series_table'); +``` diff --git a/docs/en/sql-reference/table-functions/timeSeriesMetrics.md b/docs/en/sql-reference/table-functions/timeSeriesMetrics.md new file mode 100644 index 00000000000..913f1185bca --- /dev/null +++ b/docs/en/sql-reference/table-functions/timeSeriesMetrics.md @@ -0,0 +1,28 @@ +--- +slug: /en/sql-reference/table-functions/timeSeriesMetrics +sidebar_position: 145 +sidebar_label: timeSeriesMetrics +--- + +# timeSeriesMetrics + +`timeSeriesMetrics(db_name.time_series_table)` - Returns the [metrics](../../engines/table-engines/integrations/time-series.md#metrics-table) table +used by table `db_name.time_series_table` which table engine is [TimeSeries](../../engines/table-engines/integrations/time-series.md): + +``` sql +CREATE TABLE db_name.time_series_table ENGINE=TimeSeries METRICS metrics_table +``` + +The function also works if the _metrics_ table is inner: + +``` sql +CREATE TABLE db_name.time_series_table ENGINE=TimeSeries METRICS INNER UUID '01234567-89ab-cdef-0123-456789abcdef' +``` + +The following queries are equivalent: + +``` sql +SELECT * FROM timeSeriesMetrics(db_name.time_series_table); +SELECT * FROM timeSeriesMetrics('db_name.time_series_table'); +SELECT * FROM timeSeriesMetrics('db_name', 'time_series_table'); +``` diff --git a/docs/en/sql-reference/table-functions/timeSeriesTags.md b/docs/en/sql-reference/table-functions/timeSeriesTags.md new file mode 100644 index 00000000000..663a7dc6ac8 --- /dev/null +++ b/docs/en/sql-reference/table-functions/timeSeriesTags.md @@ -0,0 +1,28 @@ +--- +slug: /en/sql-reference/table-functions/timeSeriesTags +sidebar_position: 145 +sidebar_label: timeSeriesTags +--- + +# timeSeriesTags + +`timeSeriesTags(db_name.time_series_table)` - Returns the [tags](../../engines/table-engines/integrations/time-series.md#tags-table) table +used by table `db_name.time_series_table` which table engine is 
[TimeSeries](../../engines/table-engines/integrations/time-series.md): + +``` sql +CREATE TABLE db_name.time_series_table ENGINE=TimeSeries TAGS tags_table +``` + +The function also works if the _tags_ table is inner: + +``` sql +CREATE TABLE db_name.time_series_table ENGINE=TimeSeries TAGS INNER UUID '01234567-89ab-cdef-0123-456789abcdef' +``` + +The following queries are equivalent: + +``` sql +SELECT * FROM timeSeriesTags(db_name.time_series_table); +SELECT * FROM timeSeriesTags('db_name.time_series_table'); +SELECT * FROM timeSeriesTags('db_name', 'time_series_table'); +``` diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 8a9a8d2e76c..382e64f343c 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1672,6 +1672,7 @@ fuzzQuery fuzzer fuzzers gRPC +gaugehistogram gccMurmurHash gcem generateRandom @@ -2556,6 +2557,7 @@ startsWithUTF startswith statbox stateful +stateset stddev stddevPop stddevPopStable @@ -2687,6 +2689,10 @@ themself threadpool throwIf timeDiff +TimeSeries +timeSeriesData +timeSeriesMetrics +timeSeriesTags timeSlot timeSlots timeZone From c4fda6cd4c5b7cfa40792c742eab98aa1857fd7d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 6 Aug 2024 18:26:22 +0000 Subject: [PATCH 28/56] Fix style --- src/Storages/Statistics/StatisticsTDigest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/Statistics/StatisticsTDigest.cpp b/src/Storages/Statistics/StatisticsTDigest.cpp index b0c4bfda27d..fd9b922ffc8 100644 --- a/src/Storages/Statistics/StatisticsTDigest.cpp +++ b/src/Storages/Statistics/StatisticsTDigest.cpp @@ -9,7 +9,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_STATISTICS; -extern const int LOGICAL_ERROR; } StatisticsTDigest::StatisticsTDigest(const SingleStatisticsDescription & description, const DataTypePtr & data_type_) From 45b55c4d6ed5cb5f023855ac78f4097dafba7fec Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 6 Aug 2024 22:25:08 +0200 Subject: [PATCH 29/56] Update comment --- CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d862b23e3a..2e4be09f5d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,7 +430,10 @@ endif() if (NOT OS_ANDROID AND OS_LINUX AND NOT ARCH_S390X AND NOT SANITIZE) # Slightly more efficient code can be generated - # Disabled for Android, because otherwise ClickHouse cannot run on Android. + # Using '-no-pie' builds executables with fixed addresses, resulting in slightly more efficient code + # and keeping binary addresses constant even with ASLR enabled. 
+ # Disabled on Android as it requires PIE: https://source.android.com/docs/security/enhancements#android-5 + # Disabled on IBM S390X due to build issues with 'no-pie' set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -no-pie -Wl,-no-pie") From db2d732b2b6ee7e61a2fe5c644db7ccb718e9f0c Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 6 Aug 2024 22:25:47 +0200 Subject: [PATCH 30/56] Update comment2 --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e4be09f5d3..afab666a733 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -429,7 +429,6 @@ if (NOT SANITIZE) endif() if (NOT OS_ANDROID AND OS_LINUX AND NOT ARCH_S390X AND NOT SANITIZE) - # Slightly more efficient code can be generated # Using '-no-pie' builds executables with fixed addresses, resulting in slightly more efficient code # and keeping binary addresses constant even with ASLR enabled. # Disabled on Android as it requires PIE: https://source.android.com/docs/security/enhancements#android-5 From 340a2bcd2582c563c0f2eaeda0da1f32269b5253 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 6 Aug 2024 22:36:16 +0200 Subject: [PATCH 31/56] Update comment3 --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index afab666a733..7b4e0484ab1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -433,6 +433,7 @@ if (NOT OS_ANDROID AND OS_LINUX AND NOT ARCH_S390X AND NOT SANITIZE) # and keeping binary addresses constant even with ASLR enabled. # Disabled on Android as it requires PIE: https://source.android.com/docs/security/enhancements#android-5 # Disabled on IBM S390X due to build issues with 'no-pie' + # Disabled with sanitizers to avoid issues with maximum relocation size: https://github.com/ClickHouse/ClickHouse/pull/49145 set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -fno-pie") set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -no-pie -Wl,-no-pie") From 20c2d346a5db550d38954b5c5b1de2d1a09a884c Mon Sep 17 00:00:00 2001 From: jsc0218 Date: Wed, 7 Aug 2024 00:35:25 +0000 Subject: [PATCH 32/56] just add test --- .../MergeTree/MergeTreeIndexBloomFilter.cpp | 26 +------------------ .../00945_bloom_filter_index.reference | 2 ++ .../0_stateless/00945_bloom_filter_index.sql | 11 ++++++++ ..._bloom_filter_not_supported_func.reference | 2 -- .../03215_bloom_filter_not_supported_func.sql | 14 ---------- 5 files changed, 14 insertions(+), 41 deletions(-) delete mode 100644 tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference delete mode 100644 tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp index 0a4eda3be69..dc314ce53d4 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp @@ -366,31 +366,7 @@ bool MergeTreeIndexConditionBloomFilter::extractAtomFromTree(const RPNBuilderTre } } - if (node.isFunction()) - { - /// Similar to the logic of KeyCondition, restrict the usage of bloom filter, in case of func like cast(c=1 or c=9999 as Bool). 
- const std::unordered_set atom_map - { - "equals", - "notEquals", - "has", - "mapContains", - "indexOf", - "hasAny", - "hasAll", - "in", - "notIn", - "globalIn", - "globalNotIn" - }; - - auto func_name = node.toFunctionNode().getFunctionName(); - if (atom_map.find(func_name) == std::end(atom_map)) - return false; - } - - bool res = traverseFunction(node, out, nullptr /*parent*/); - return res; + return traverseFunction(node, out, nullptr /*parent*/); } bool MergeTreeIndexConditionBloomFilter::traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent) diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.reference b/tests/queries/0_stateless/00945_bloom_filter_index.reference index e6751fe4762..9d9b49b29c9 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.reference +++ b/tests/queries/0_stateless/00945_bloom_filter_index.reference @@ -227,3 +227,5 @@ 1 value1 1 value2 2 value3 +1 +1 diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.sql b/tests/queries/0_stateless/00945_bloom_filter_index.sql index 2b7feacbd98..71109df79e7 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.sql +++ b/tests/queries/0_stateless/00945_bloom_filter_index.sql @@ -374,3 +374,14 @@ SELECT id, ary[indexOf(ary, 'value2')] FROM test_bf_indexOf WHERE ary[indexOf(ar SELECT id, ary[indexOf(ary, 'value3')] FROM test_bf_indexOf WHERE ary[indexOf(ary, 'value3')] = 'value3' ORDER BY id FORMAT TSV; DROP TABLE IF EXISTS test_bf_indexOf; + +-- expecting cast function to be unknown +DROP TABLE IF EXISTS test_bf_cast; + +CREATE TABLE test_bf_cast (c Int32, INDEX x1 (c) type bloom_filter) ENGINE = MergeTree ORDER BY c as select 1; + +SELECT count() FROM test_bf_cast WHERE cast(c=1 or c=9999 as Bool) settings use_skip_indexes=0; + +SELECT count() FROM test_bf_cast WHERE cast(c=1 or c=9999 as Bool) settings use_skip_indexes=1; + +DROP TABLE test_bf_cast; \ No newline at end of file diff --git a/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference b/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference deleted file mode 100644 index 6ed281c757a..00000000000 --- a/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.reference +++ /dev/null @@ -1,2 +0,0 @@ -1 -1 diff --git a/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql b/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql deleted file mode 100644 index 3d094244892..00000000000 --- a/tests/queries/0_stateless/03215_bloom_filter_not_supported_func.sql +++ /dev/null @@ -1,14 +0,0 @@ -drop table if exists t; - -create table t ( - c Int32, - index x1 (c) type bloom_filter -) engine=MergeTree order by c as select 1; - -SELECT count() FROM t WHERE cast(c=1 or c=9999 as Bool) -settings use_skip_indexes=0; - -SELECT count() FROM t WHERE cast(c=1 or c=9999 as Bool) -settings use_skip_indexes=1; - -drop table t; \ No newline at end of file From 25f557667a90f9805bc22796a8c799f3203019fa Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 7 Aug 2024 02:47:53 +0200 Subject: [PATCH 33/56] Change log level in clickhouse-local --- src/Client/LocalConnection.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp index 072184e0a66..7595a29912b 100644 --- a/src/Client/LocalConnection.cpp +++ b/src/Client/LocalConnection.cpp @@ -365,7 +365,7 @@ bool LocalConnection::poll(size_t) { while (pollImpl()) { - 
LOG_DEBUG(&Poco::Logger::get("LocalConnection"), "Executor timeout encountered, will retry"); + LOG_TEST(&Poco::Logger::get("LocalConnection"), "Executor timeout encountered, will retry"); if (needSendProgressOrMetrics()) return true; From 2a5a8f15f4ab5d4322fb09d41ca1e8279197abe4 Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 6 Aug 2024 20:51:44 -0400 Subject: [PATCH 34/56] Add an explicit error for `ALTER MODIFY SQL SECURITY` on non-view tables. --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++++ .../02884_create_view_with_sql_security_option.reference | 1 + .../0_stateless/02884_create_view_with_sql_security_option.sh | 2 ++ 3 files changed, 7 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 49888596fbb..5d63f6c94d3 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3348,6 +3348,10 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ALTER MODIFY REFRESH is not supported by MergeTree engines family"); + if (command.type == AlterCommand::MODIFY_SQL_SECURITY) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "ALTER MODIFY SQL SECURITY is not supported by MergeTree engines family"); + if (command.type == AlterCommand::MODIFY_ORDER_BY && !is_custom_partitioned) { throw Exception(ErrorCodes::BAD_ARGUMENTS, diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference index a03343c8cb3..39e7aad87e0 100644 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.reference @@ -27,6 +27,7 @@ OK OK 100 100 +OK ===== TestGrants ===== OK OK diff --git a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh index cc4e76a9ed9..fadbbff7f34 100755 --- a/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh +++ b/tests/queries/0_stateless/02884_create_view_with_sql_security_option.sh @@ -199,6 +199,8 @@ ${CLICKHOUSE_CLIENT} --user $user2 --query "INSERT INTO source SELECT * FROM gen ${CLICKHOUSE_CLIENT} --query "SELECT count() FROM destination1" ${CLICKHOUSE_CLIENT} --query "SELECT count() FROM destination2" +(( $(${CLICKHOUSE_CLIENT} --query "ALTER TABLE test_table MODIFY SQL SECURITY INVOKER" 2>&1 | grep -c "is not supported") >= 1 )) && echo "OK" || echo "UNEXPECTED" + echo "===== TestGrants =====" ${CLICKHOUSE_CLIENT} --query "GRANT CREATE ON *.* TO $user1" ${CLICKHOUSE_CLIENT} --query "GRANT SELECT ON $db.test_table TO $user1, $user2" From c933a38955e288afbef5c246fed9640878b0a68f Mon Sep 17 00:00:00 2001 From: khodyrevyurii Date: Tue, 6 Aug 2024 22:53:24 +0500 Subject: [PATCH 35/56] change std::thread::hardware_concurrency on container friendly method getNumberOfPhysicalCPUCores --- programs/server/Server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 7800ee9ff00..46bbc235fee 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1623,7 +1623,7 @@ try concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num; if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0) { - auto value = 
new_server_settings.concurrent_threads_soft_limit_ratio_to_cores * std::thread::hardware_concurrency(); + auto value = new_server_settings.concurrent_threads_soft_limit_ratio_to_cores * getNumberOfPhysicalCPUCores(); if (value > 0 && value < concurrent_threads_soft_limit) concurrent_threads_soft_limit = value; } From 00b62b1c0dccd16e45cef445cc8bf717b2da6486 Mon Sep 17 00:00:00 2001 From: khodyrevyurii Date: Wed, 7 Aug 2024 01:26:25 +0500 Subject: [PATCH 36/56] Minor clarifycation for method getNumberOfPhysicalCPUCores --- programs/server/Server.cpp | 7 ++++--- src/Common/getNumberOfPhysicalCPUCores.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 46bbc235fee..618bd2b011c 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -814,10 +814,11 @@ try const size_t physical_server_memory = getMemoryAmount(); - LOG_INFO(log, "Available RAM: {}; physical cores: {}; logical cores: {}.", + LOG_INFO(log, "Available RAM: {}; logical cores: {}; used cores: {}.", formatReadableSizeWithBinarySuffix(physical_server_memory), - getNumberOfPhysicalCPUCores(), // on ARM processors it can show only enabled at current moment cores - std::thread::hardware_concurrency()); + std::thread::hardware_concurrency(), + getNumberOfPhysicalCPUCores() // on ARM processors it can show only enabled at current moment cores + ); #if defined(__x86_64__) String cpu_info; diff --git a/src/Common/getNumberOfPhysicalCPUCores.h b/src/Common/getNumberOfPhysicalCPUCores.h index 827e95e1bea..9e3412fdcba 100644 --- a/src/Common/getNumberOfPhysicalCPUCores.h +++ b/src/Common/getNumberOfPhysicalCPUCores.h @@ -1,4 +1,5 @@ #pragma once /// Get number of CPU cores without hyper-threading. +/// The calculation respects possible cgroups limits. unsigned getNumberOfPhysicalCPUCores(); From a9c284dd8efb439ff06cf0f95e2a9920a26fdf5d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 7 Aug 2024 10:07:27 +0000 Subject: [PATCH 37/56] Include fixes aafe498 and cfaa852 --- src/Storages/Statistics/Statistics.cpp | 30 +++++++++++++++++-- src/Storages/Statistics/Statistics.h | 6 ++++ src/Storages/Statistics/StatisticsTDigest.cpp | 18 ++++------- .../02864_statistics_bugs.reference | 2 ++ .../0_stateless/02864_statistics_bugs.sql | 18 +++++++++++ 5 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 52eec437ac2..fd686c5f0aa 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -1,14 +1,17 @@ #include +#include +#include +#include +#include #include #include +#include #include #include #include #include #include #include -#include -#include #include "config.h" /// USE_DATASKETCHES @@ -27,6 +30,29 @@ enum StatisticsFileVersion : UInt16 V0 = 0, }; +std::optional StatisticsUtils::tryConvertToFloat64(const Field & value, const DataTypePtr & data_type) +{ + if (data_type->isValueRepresentedByNumber()) + { + Field value_converted; + + if (isInteger(data_type) && (value.getType() == Field::Types::Float64 || value.getType() == Field::Types::String)) + /// For case val_int32 < 10.5 or val_int32 < '10.5' we should convert 10.5 to Float64. + value_converted = convertFieldToType(value, *DataTypeFactory::instance().get("Float64")); + else + /// We should convert value to the real column data type and then translate it to Float64. 
+ /// For example for expression col_date > '2024-08-07', if we directly convert '2024-08-07' to Float64, we will get null. + value_converted = convertFieldToType(value, *data_type); + + if (value_converted.isNull()) + return {}; + + Float64 value_as_float = applyVisitor(FieldVisitorConvertToNumber(), value_converted); + return value_as_float; + } + return {}; +} + IStatistics::IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) { diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index 593ac20edb5..2a30c0de315 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -14,6 +14,12 @@ namespace DB constexpr auto STATS_FILE_PREFIX = "statistics_"; constexpr auto STATS_FILE_SUFFIX = ".stats"; +struct StatisticsUtils +{ + /// Returns std::nullopt if input Field cannot be converted to a concrete value + /// - `data_type` is the type of the column on which the statistics object was build on + static std::optional tryConvertToFloat64(const Field & value, const DataTypePtr & data_type); +}; /// Statistics describe properties of the values in the column, /// e.g. how many unique values exist, diff --git a/src/Storages/Statistics/StatisticsTDigest.cpp b/src/Storages/Statistics/StatisticsTDigest.cpp index fd9b922ffc8..285b779036f 100644 --- a/src/Storages/Statistics/StatisticsTDigest.cpp +++ b/src/Storages/Statistics/StatisticsTDigest.cpp @@ -1,8 +1,6 @@ #include -#include #include #include -#include namespace DB { @@ -41,22 +39,18 @@ void StatisticsTDigest::deserialize(ReadBuffer & buf) Float64 StatisticsTDigest::estimateLess(const Field & val) const { - Field val_converted = convertFieldToType(val, *data_type); - if (val_converted.isNull()) + auto val_as_float = StatisticsUtils::tryConvertToFloat64(val, data_type); + if (!val_as_float.has_value()) return 0; - - auto val_as_float = applyVisitor(FieldVisitorConvertToNumber(), val_converted); - return t_digest.getCountLessThan(val_as_float); + return t_digest.getCountLessThan(*val_as_float); } Float64 StatisticsTDigest::estimateEqual(const Field & val) const { - Field val_converted = convertFieldToType(val, *data_type); - if (val_converted.isNull()) + auto val_as_float = StatisticsUtils::tryConvertToFloat64(val, data_type); + if (!val_as_float.has_value()) return 0; - - auto val_as_float = applyVisitor(FieldVisitorConvertToNumber(), val_converted); - return t_digest.getCountEqual(val_as_float); + return t_digest.getCountEqual(*val_as_float); } void tdigestStatisticsValidator(const SingleStatisticsDescription & /*description*/, const DataTypePtr & data_type) diff --git a/tests/queries/0_stateless/02864_statistics_bugs.reference b/tests/queries/0_stateless/02864_statistics_bugs.reference index f599e28b8ab..a7eeae9def6 100644 --- a/tests/queries/0_stateless/02864_statistics_bugs.reference +++ b/tests/queries/0_stateless/02864_statistics_bugs.reference @@ -1 +1,3 @@ 10 +11 +0 diff --git a/tests/queries/0_stateless/02864_statistics_bugs.sql b/tests/queries/0_stateless/02864_statistics_bugs.sql index ef1735550e6..01bbe221b0f 100644 --- a/tests/queries/0_stateless/02864_statistics_bugs.sql +++ b/tests/queries/0_stateless/02864_statistics_bugs.sql @@ -7,3 +7,21 @@ CREATE TABLE bug_67742 (a Float64 STATISTICS(tdigest)) Engine = MergeTree() ORDE INSERT INTO bug_67742 SELECT number FROM system.numbers LIMIT 10000; SELECT count(*) FROM bug_67742 WHERE a < '10'; DROP TABLE bug_67742; + +DROP TABLE IF EXISTS bug_67742; +CREATE TABLE bug_67742 (a Int32 STATISTICS(tdigest)) 
Engine = MergeTree() ORDER BY tuple(); +INSERT INTO bug_67742 SELECT number FROM system.numbers LIMIT 10000; +SELECT count(*) FROM bug_67742 WHERE a < '10.5'; -- { serverError TYPE_MISMATCH } +DROP TABLE bug_67742; + +DROP TABLE IF EXISTS bug_67742; +CREATE TABLE bug_67742 (a Int32 STATISTICS(tdigest)) Engine = MergeTree() ORDER BY tuple(); +INSERT INTO bug_67742 SELECT number FROM system.numbers LIMIT 10000; +SELECT count(*) FROM bug_67742 WHERE a < 10.5; +DROP TABLE bug_67742; + +DROP TABLE IF EXISTS bug_67742; +CREATE TABLE bug_67742 (a Int16 STATISTICS(tdigest)) Engine = MergeTree() ORDER BY tuple(); +INSERT INTO bug_67742 SELECT number FROM system.numbers LIMIT 10000; +SELECT count(*) FROM bug_67742 WHERE a < '9999999999999999999999999'; +DROP TABLE bug_67742; From cf8ddbc15e0bb1143ff1737aec80171b518b24bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=D1=81hael=20Stetsyuk?= <59827607+mstetsyuk@users.noreply.github.com> Date: Wed, 7 Aug 2024 19:03:20 +0100 Subject: [PATCH 38/56] Update src/Databases/DatabaseReplicated.cpp Co-authored-by: Alexander Tokmakov --- src/Databases/DatabaseReplicated.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 213c94d4d94..09dd2065b19 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -390,7 +390,8 @@ ReplicasInfo DatabaseReplicated::tryGetReplicasInfo(const ClusterPtr & cluster_) } return replicas_info; - } catch (...) + } + catch (...) { tryLogCurrentException(log); return {}; From f08cb90fe3f017d44e4dd58c7e696396d6bf5ac0 Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Wed, 7 Aug 2024 19:54:55 +0000 Subject: [PATCH 39/56] fxs --- src/Databases/DatabaseReplicated.cpp | 9 ++++++++- src/Storages/System/StorageSystemClusters.cpp | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 09dd2065b19..fe00c1c60aa 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -379,10 +379,17 @@ ReplicasInfo DatabaseReplicated::tryGetReplicasInfo(const ClusterPtr & cluster_) auto replica_active = zk_res[2 * global_replica_index + 1]; auto replica_log_ptr = zk_res[2 * global_replica_index + 2]; + UInt64 recovery_time = 0; + { + std::lock_guard lock(ddl_worker_mutex); + if (replica.is_local && ddl_worker) + recovery_time = ddl_worker->getCurrentInitializationDurationMs(); + } + replicas_info[global_replica_index] = ReplicaInfo{ .is_active = replica_active.error == Coordination::Error::ZOK, .replication_lag = replica_log_ptr.error != Coordination::Error::ZNONODE ? std::optional(max_log_ptr - parse(replica_log_ptr.data)) : std::nullopt, - .recovery_time = replica.is_local && ddl_worker ? 
ddl_worker->getCurrentInitializationDurationMs() : 0, + .recovery_time = recovery_time, }; ++global_replica_index; diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index db1955c2e99..9493d2c97ab 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -70,8 +70,9 @@ void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const std const auto & shards_info = cluster->getShardsInfo(); const auto & addresses_with_failover = cluster->getShardsAddresses(); + size_t recovery_time_column_idx = columns_mask.size() - 1, replication_lag_column_idx = columns_mask.size() - 2, is_active_column_idx = columns_mask.size() - 3; ReplicasInfo replicas_info; - if (replicated) + if (replicated && (columns_mask[recovery_time_column_idx] || columns_mask[replication_lag_column_idx] || columns_mask[is_active_column_idx])) replicas_info = replicated->tryGetReplicasInfo(name_and_cluster.second); size_t replica_idx = 0; @@ -122,6 +123,7 @@ void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const std if (columns_mask[src_index++]) res_columns[res_index++]->insert(address.database_replica_name); + /// make sure these three columns remain the last ones if (columns_mask[src_index++]) { if (replicas_info.empty()) @@ -132,7 +134,6 @@ void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const std res_columns[res_index++]->insert(replica_info.is_active); } } - if (columns_mask[src_index++]) { if (replicas_info.empty()) @@ -146,7 +147,6 @@ void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const std res_columns[res_index++]->insertDefault(); } } - if (columns_mask[src_index++]) { if (replicas_info.empty()) From 301ac5dab7222035cdcc4fa32d061eb9f2294c05 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 9 Jul 2024 17:51:48 +0200 Subject: [PATCH 40/56] Fix possible data-race StorageKafka with statistics_interval_ms>0 The problem here is that ignorelist did not work by some reason, if I will look at the ignored functions it should not contain any TSan interseption code, while it does: $ lldb-13 clickhouse (lldb) target create "clickhouse" disas -n rd_avg_rollover Current executable set to '/home/azat/ch/tmp/tsan-test/clickhouse' (x86_64). 
(lldb) disas -n rd_avg_rollover clickhouse`rd_kafka_stats_emit_avg: clickhouse[0x1cbf84a7] <+39>: leaq 0x30(%r15), %r12 clickhouse[0x1cbf84ab] <+43>: movq %r12, %rdi clickhouse[0x1cbf84ae] <+46>: callq 0x1ccdad40 ; rdk_thread_mutex_lock at tinycthread.c:111 clickhouse[0x1cbf84b3] <+51>: leaq 0x58(%r15), %rdi clickhouse[0x1cbf84b7] <+55>: callq 0x71b5390 ; __tsan_read4 clickhouse[0x1cbf84bc] <+60>: cmpl $0x0, 0x58(%r15) clickhouse[0x1cbf84c1] <+65>: je 0x1cbf8595 ; <+277> [inlined] rd_avg_rollover + 238 at rdavg.h clickhouse[0x1cbf84c7] <+71>: leaq -0xc8(%rbp), %rdi clickhouse[0x1cbf84ce] <+78>: xorl %esi, %esi clickhouse[0x1cbf84d0] <+80>: callq 0x1ccdac80 ; rdk_thread_mutex_init at tinycthread.c:62 clickhouse[0x1cbf84d5] <+85>: leaq 0x5c(%r15), %rdi clickhouse[0x1cbf84d9] <+89>: callq 0x71b5390 ; __tsan_read4 (lldb) disas -n rd_avg_calc clickhouse`rd_kafka_broker_ops_io_serve: clickhouse[0x1cbdf086] <+1990>: leaq 0x5a4(%rbx), %rdi clickhouse[0x1cbdf08d] <+1997>: callq 0x71b5390 ; __tsan_read4 clickhouse[0x1cbdf092] <+2002>: cmpl $0x0, 0x5a4(%rbx) clickhouse[0x1cbdf099] <+2009>: je 0x1cbdf12b ; <+2155> [inlined] rd_kafka_broker_timeout_scan + 719 at rdkafka_broker.c I guess the reason is that they had been inlined So now rd_avg_calc() guarded with a mutex. Refs: https://github.com/ClickHouse/librdkafka/pull/11 Fixes: https://github.com/ClickHouse/ClickHouse/issues/60939 Signed-off-by: Azat Khuzhin --- contrib/librdkafka | 2 +- tests/tsan_ignorelist.txt | 4 +--- tests/ubsan_ignorelist.txt | 1 + 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/contrib/librdkafka b/contrib/librdkafka index 2d2aab6f5b7..39d4ed49ccf 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit 2d2aab6f5b79db1cfca15d7bf0dee75d00d82082 +Subproject commit 39d4ed49ccf3406e2bf825d5d7b0903b5a290782 diff --git a/tests/tsan_ignorelist.txt b/tests/tsan_ignorelist.txt index 96bf6e4251f..2a31fc9bc15 100644 --- a/tests/tsan_ignorelist.txt +++ b/tests/tsan_ignorelist.txt @@ -5,11 +5,9 @@ # # Caveats for generic entry "fun": # - does not work for __attribute__((__always_inline__)) +# - and may not work for functions that had been inlined # - requires asterisk at the beginning *and* end for static functions # [thread] # https://github.com/ClickHouse/ClickHouse/issues/55629 fun:rd_kafka_broker_set_nodename -# https://github.com/ClickHouse/ClickHouse/issues/60443 -fun:*rd_avg_calc* -fun:*rd_avg_rollover* diff --git a/tests/ubsan_ignorelist.txt b/tests/ubsan_ignorelist.txt index 57d6598afa6..b75819b3f4b 100644 --- a/tests/ubsan_ignorelist.txt +++ b/tests/ubsan_ignorelist.txt @@ -9,6 +9,7 @@ # # Caveats for generic entry "fun": # - does not work for __attribute__((__always_inline__)) +# - and may not work for functions that had been inlined # - requires asterisk at the beginning *and* end for static functions # [undefined] From 5c97205742ff12f28cab2b853a9473cbb59edfe3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Aug 2024 07:23:10 +0000 Subject: [PATCH 41/56] Reapply "Bump rocksdb from v8.10 to v9.4 + enable jemalloc and liburing" This reverts commit ff8ce505d752eff1c867d73b47e39a03f0f13622. 
--- contrib/CMakeLists.txt | 2 +- contrib/rocksdb | 2 +- contrib/rocksdb-cmake/CMakeLists.txt | 44 ++++++++++++++++++---------- 3 files changed, 30 insertions(+), 18 deletions(-) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 977efda15ff..eb3afe0ccdf 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -71,7 +71,6 @@ add_contrib (zlib-ng-cmake zlib-ng) add_contrib (bzip2-cmake bzip2) add_contrib (minizip-ng-cmake minizip-ng) add_contrib (snappy-cmake snappy) -add_contrib (rocksdb-cmake rocksdb) add_contrib (thrift-cmake thrift) # parquet/arrow/orc add_contrib (arrow-cmake arrow) # requires: snappy, thrift, double-conversion @@ -148,6 +147,7 @@ add_contrib (hive-metastore-cmake hive-metastore) # requires: thrift, avro, arro add_contrib (cppkafka-cmake cppkafka) add_contrib (libpqxx-cmake libpqxx) add_contrib (libpq-cmake libpq) +add_contrib (rocksdb-cmake rocksdb) # requires: jemalloc, snappy, zlib, lz4, zstd, liburing add_contrib (nuraft-cmake NuRaft) add_contrib (fast_float-cmake fast_float) add_contrib (idna-cmake idna) diff --git a/contrib/rocksdb b/contrib/rocksdb index 49ce8a1064d..5f003e4a22d 160000 --- a/contrib/rocksdb +++ b/contrib/rocksdb @@ -1 +1 @@ -Subproject commit 49ce8a1064dd1ad89117899839bf136365e49e79 +Subproject commit 5f003e4a22d2e48e37c98d9620241237cd30dd24 diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 57c056532c6..7e5e9a28d0f 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -5,36 +5,38 @@ if (NOT ENABLE_ROCKSDB OR NO_SSE3_OR_HIGHER) # assumes SSE4.2 and PCLMUL return() endif() -# not in original build system, otherwise xxHash.cc fails to compile with ClickHouse C++23 default -set (CMAKE_CXX_STANDARD 20) - -# Always disable jemalloc for rocksdb by default because it introduces non-standard jemalloc APIs -option(WITH_JEMALLOC "build with JeMalloc" OFF) - -option(WITH_LIBURING "build with liburing" OFF) # TODO could try to enable this conditionally, depending on ClickHouse's ENABLE_LIBURING - # ClickHouse cannot be compiled without snappy, lz4, zlib, zstd option(WITH_SNAPPY "build with SNAPPY" ON) option(WITH_LZ4 "build with lz4" ON) option(WITH_ZLIB "build with zlib" ON) option(WITH_ZSTD "build with zstd" ON) -if(WITH_SNAPPY) +if (ENABLE_JEMALLOC) + add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) + list (APPEND THIRDPARTY_LIBS ch_contrib::jemalloc) +endif () + +if (ENABLE_LIBURING) + add_definitions(-DROCKSDB_IOURING_PRESENT) + list (APPEND THIRDPARTY_LIBS ch_contrib::liburing) +endif () + +if (WITH_SNAPPY) add_definitions(-DSNAPPY) list(APPEND THIRDPARTY_LIBS ch_contrib::snappy) endif() -if(WITH_ZLIB) +if (WITH_ZLIB) add_definitions(-DZLIB) list(APPEND THIRDPARTY_LIBS ch_contrib::zlib) endif() -if(WITH_LZ4) +if (WITH_LZ4) add_definitions(-DLZ4) list(APPEND THIRDPARTY_LIBS ch_contrib::lz4) endif() -if(WITH_ZSTD) +if (WITH_ZSTD) add_definitions(-DZSTD) list(APPEND THIRDPARTY_LIBS ch_contrib::zstd) endif() @@ -88,6 +90,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/cache/sharded_cache.cc ${ROCKSDB_SOURCE_DIR}/cache/tiered_secondary_cache.cc ${ROCKSDB_SOURCE_DIR}/db/arena_wrapped_db_iter.cc + ${ROCKSDB_SOURCE_DIR}/db/attribute_group_iterator_impl.cc ${ROCKSDB_SOURCE_DIR}/db/blob/blob_contents.cc ${ROCKSDB_SOURCE_DIR}/db/blob/blob_fetcher.cc ${ROCKSDB_SOURCE_DIR}/db/blob/blob_file_addition.cc @@ -104,6 +107,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/db/blob/prefetch_buffer_collection.cc ${ROCKSDB_SOURCE_DIR}/db/builder.cc 
${ROCKSDB_SOURCE_DIR}/db/c.cc + ${ROCKSDB_SOURCE_DIR}/db/coalescing_iterator.cc ${ROCKSDB_SOURCE_DIR}/db/column_family.cc ${ROCKSDB_SOURCE_DIR}/db/compaction/compaction.cc ${ROCKSDB_SOURCE_DIR}/db/compaction/compaction_iterator.cc @@ -124,6 +128,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_write.cc ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_compaction_flush.cc ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_files.cc + ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_follower.cc ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_open.cc ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_debug.cc ${ROCKSDB_SOURCE_DIR}/db/db_impl/db_impl_experimental.cc @@ -181,6 +186,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/env/env_encryption.cc ${ROCKSDB_SOURCE_DIR}/env/file_system.cc ${ROCKSDB_SOURCE_DIR}/env/file_system_tracer.cc + ${ROCKSDB_SOURCE_DIR}/env/fs_on_demand.cc ${ROCKSDB_SOURCE_DIR}/env/fs_remap.cc ${ROCKSDB_SOURCE_DIR}/env/mock_env.cc ${ROCKSDB_SOURCE_DIR}/env/unique_id_gen.cc @@ -368,6 +374,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/persistent_cache/volatile_tier_impl.cc ${ROCKSDB_SOURCE_DIR}/utilities/simulator_cache/cache_simulator.cc ${ROCKSDB_SOURCE_DIR}/utilities/simulator_cache/sim_cache.cc + ${ROCKSDB_SOURCE_DIR}/utilities/table_properties_collectors/compact_for_tiering_collector.cc ${ROCKSDB_SOURCE_DIR}/utilities/table_properties_collectors/compact_on_deletion_collector.cc ${ROCKSDB_SOURCE_DIR}/utilities/trace/file_trace_reader_writer.cc ${ROCKSDB_SOURCE_DIR}/utilities/trace/replayer_impl.cc @@ -388,6 +395,7 @@ set(SOURCES ${ROCKSDB_SOURCE_DIR}/utilities/transactions/write_prepared_txn_db.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/write_unprepared_txn.cc ${ROCKSDB_SOURCE_DIR}/utilities/transactions/write_unprepared_txn_db.cc + ${ROCKSDB_SOURCE_DIR}/utilities/types_util.cc ${ROCKSDB_SOURCE_DIR}/utilities/ttl/db_ttl_impl.cc ${ROCKSDB_SOURCE_DIR}/utilities/wal_filter.cc ${ROCKSDB_SOURCE_DIR}/utilities/write_batch_with_index/write_batch_with_index.cc @@ -418,14 +426,18 @@ if(HAS_ARMV8_CRC) endif(HAS_ARMV8_CRC) list(APPEND SOURCES - "${ROCKSDB_SOURCE_DIR}/port/port_posix.cc" - "${ROCKSDB_SOURCE_DIR}/env/env_posix.cc" - "${ROCKSDB_SOURCE_DIR}/env/fs_posix.cc" - "${ROCKSDB_SOURCE_DIR}/env/io_posix.cc") + ${ROCKSDB_SOURCE_DIR}/port/port_posix.cc + ${ROCKSDB_SOURCE_DIR}/env/env_posix.cc + ${ROCKSDB_SOURCE_DIR}/env/fs_posix.cc + ${ROCKSDB_SOURCE_DIR}/env/io_posix.cc) add_library(_rocksdb ${SOURCES}) add_library(ch_contrib::rocksdb ALIAS _rocksdb) target_link_libraries(_rocksdb PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) +# Not in the native build system but useful anyways: +# Make all functions in xxHash.h inline. 
Beneficial for performance: https://github.com/Cyan4973/xxHash/tree/v0.8.2#build-modifiers +target_compile_definitions (_rocksdb PRIVATE XXH_INLINE_ALL) + # SYSTEM is required to overcome some issues target_include_directories(_rocksdb SYSTEM BEFORE INTERFACE "${ROCKSDB_SOURCE_DIR}/include") From 5a17d93bf3968c9d7b19935ca4270d29b8eebef9 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Aug 2024 07:29:20 +0000 Subject: [PATCH 42/56] Fix freebsd build --- contrib/rocksdb-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/rocksdb-cmake/CMakeLists.txt b/contrib/rocksdb-cmake/CMakeLists.txt index 7e5e9a28d0f..44aa7494607 100644 --- a/contrib/rocksdb-cmake/CMakeLists.txt +++ b/contrib/rocksdb-cmake/CMakeLists.txt @@ -11,7 +11,7 @@ option(WITH_LZ4 "build with lz4" ON) option(WITH_ZLIB "build with zlib" ON) option(WITH_ZSTD "build with zstd" ON) -if (ENABLE_JEMALLOC) +if (ENABLE_JEMALLOC AND OS_LINUX) # gives compile errors with jemalloc enabled for rocksdb on non-Linux add_definitions(-DROCKSDB_JEMALLOC -DJEMALLOC_NO_DEMANGLE) list (APPEND THIRDPARTY_LIBS ch_contrib::jemalloc) endif () From 71a761232c36f5ec3c30df7f3c4c3294641e414f Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Thu, 8 Aug 2024 08:45:42 +0100 Subject: [PATCH 43/56] empty From e90487fd54a5c87eb0875191e7547fc2b7f2e229 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 8 Aug 2024 12:57:50 +0200 Subject: [PATCH 44/56] tests/clickhouse-test: remove superior global Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 88ff6753a8f..480dc553247 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -363,8 +363,6 @@ CAPTURE_CLIENT_STACKTRACE = False def kill_process_group(pgid): - global CAPTURE_CLIENT_STACKTRACE - print(f"Killing process group {pgid}") print(f"Processes in process group {pgid}:") print( From 420f97c8506d7c9d4216c868a75547c48a408de2 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 8 Aug 2024 12:58:40 +0200 Subject: [PATCH 45/56] tests/clickhouse-test: update return type hint in run_single_test() Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 480dc553247..2da7550158b 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1603,7 +1603,7 @@ class TestCase: def run_single_test( self, server_logs_level, client_options - ) -> Tuple[Optional[Popen], str, str, str, float]: + ) -> Tuple[Optional[Popen], float]: args = self.testcase_args client = args.testcase_client start_time = args.testcase_start_time From 979f93df12b8901df5af56f019861f910c7637cc Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 8 Aug 2024 13:00:07 +0200 Subject: [PATCH 46/56] tests/clickhouse-test: better english in comment Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 2da7550158b..a70d706c7e7 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -415,8 +415,8 @@ def cleanup_child_processes(pid): ) # Due to start_new_session=True, it is not enough to kill by PGID, we need # to look at children processes as well. 
- # But we are hoping that nobody create session in the tests (though it is - # possible via timeout(), but we assuming that they will be killed by + # But we are hoping that nobody creates session in the tests (though it is + # possible via timeout(), but we are assuming that they will be killed by # timeout). processes = subprocess.check_output( f"pgrep --parent {pid}", shell=True, stderr=subprocess.STDOUT From bc2740aa7000694a20f3db5f13538d030248b4e0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 8 Aug 2024 13:00:37 +0200 Subject: [PATCH 47/56] tests/clickhouse-test: s/RELEASE_BUILD/RELEASE_NON_SANITIZED/g Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index a70d706c7e7..a3d7e0e922d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -376,7 +376,7 @@ def kill_process_group(pgid): # Let's try to dump stacktrace in client (useful to catch issues there) os.killpg(pgid, signal.SIGTSTP) # Wait some time for clickhouse utilities to gather stacktrace - if RELEASE_BUILD: + if RELEASE_NON_SANITIZED: sleep(0.5) else: sleep(10) @@ -2407,11 +2407,11 @@ class BuildFlags: # Release and non-sanitizer build -RELEASE_BUILD = False +RELEASE_NON_SANITIZED = False def collect_build_flags(args): - global RELEASE_BUILD + global RELEASE_NON_SANITIZED result = [] @@ -2437,7 +2437,7 @@ def collect_build_flags(args): elif b"RelWithDebInfo" in value or b"Release" in value: result.append(BuildFlags.RELEASE) - RELEASE_BUILD = result == [BuildFlags.RELEASE] + RELEASE_NON_SANITIZED = result == [BuildFlags.RELEASE] value = clickhouse_execute( args, From 4b2234f87d7b7ff2033327bd1c03278735438f1a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Aug 2024 12:06:32 +0000 Subject: [PATCH 48/56] Minor fixups --- .../0_stateless/00945_bloom_filter_index.sql | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/00945_bloom_filter_index.sql b/tests/queries/0_stateless/00945_bloom_filter_index.sql index 71109df79e7..6e3819e74d3 100644 --- a/tests/queries/0_stateless/00945_bloom_filter_index.sql +++ b/tests/queries/0_stateless/00945_bloom_filter_index.sql @@ -375,13 +375,9 @@ SELECT id, ary[indexOf(ary, 'value3')] FROM test_bf_indexOf WHERE ary[indexOf(ar DROP TABLE IF EXISTS test_bf_indexOf; --- expecting cast function to be unknown +-- Test for bug #65597 DROP TABLE IF EXISTS test_bf_cast; - -CREATE TABLE test_bf_cast (c Int32, INDEX x1 (c) type bloom_filter) ENGINE = MergeTree ORDER BY c as select 1; - -SELECT count() FROM test_bf_cast WHERE cast(c=1 or c=9999 as Bool) settings use_skip_indexes=0; - -SELECT count() FROM test_bf_cast WHERE cast(c=1 or c=9999 as Bool) settings use_skip_indexes=1; - -DROP TABLE test_bf_cast; \ No newline at end of file +CREATE TABLE test_bf_cast (c Int32, INDEX x1 (c) type bloom_filter) ENGINE = MergeTree ORDER BY c AS SELECT 1; +SELECT count() FROM test_bf_cast WHERE cast(c = 1 OR c = 9999 AS Bool) SETTINGS use_skip_indexes=0; +SELECT count() FROM test_bf_cast WHERE cast(c = 1 OR c = 9999 AS Bool) SETTINGS use_skip_indexes=1; +DROP TABLE test_bf_cast; From eca6bba0388f21c67d0caf24e56032bc6b7cd339 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 8 Aug 2024 12:50:45 +0000 Subject: [PATCH 49/56] Split 00284_external_aggregation --- .../00284_external_aggregation.reference | 20 ----------------- .../00284_external_aggregation.sql | 19 +++------------- 
.../00284_external_aggregation_2.reference | 20 +++++++++++++++++ .../00284_external_aggregation_2.sql | 22 +++++++++++++++++++ .../0_stateless/02099_tsv_raw_format_1.sh | 3 +++ .../0_stateless/02099_tsv_raw_format_2.sh | 3 +++ 6 files changed, 51 insertions(+), 36 deletions(-) create mode 100644 tests/queries/0_stateless/00284_external_aggregation_2.reference create mode 100644 tests/queries/0_stateless/00284_external_aggregation_2.sql diff --git a/tests/queries/0_stateless/00284_external_aggregation.reference b/tests/queries/0_stateless/00284_external_aggregation.reference index be0db217a97..48e30e781e0 100644 --- a/tests/queries/0_stateless/00284_external_aggregation.reference +++ b/tests/queries/0_stateless/00284_external_aggregation.reference @@ -1,22 +1,2 @@ 49999995000000 10000000 499999500000 1000000 15 -100033 2 -100034 2 -100035 2 -100036 2 -100037 2 -100038 2 -100039 2 -10004 2 -100040 2 -100041 2 -100033 2 -100034 2 -100035 2 -100036 2 -100037 2 -100038 2 -100039 2 -10004 2 -100040 2 -100041 2 diff --git a/tests/queries/0_stateless/00284_external_aggregation.sql b/tests/queries/0_stateless/00284_external_aggregation.sql index c1140faaa28..cdc31ff68c8 100644 --- a/tests/queries/0_stateless/00284_external_aggregation.sql +++ b/tests/queries/0_stateless/00284_external_aggregation.sql @@ -1,5 +1,8 @@ -- Tags: long +-- This test was split in two due to long runtimes in sanitizers. +-- The other part is 00284_external_aggregation_2. + SET max_bytes_before_external_group_by = 100000000; SET max_memory_usage = 410000000; SET group_by_two_level_threshold = 100000; @@ -7,19 +10,3 @@ SET group_by_two_level_threshold_bytes = 50000000; SELECT sum(k), sum(c) FROM (SELECT number AS k, count() AS c FROM (SELECT * FROM system.numbers LIMIT 10000000) GROUP BY k); SELECT sum(k), sum(c), max(u) FROM (SELECT number AS k, count() AS c, uniqArray(range(number % 16)) AS u FROM (SELECT * FROM system.numbers LIMIT 1000000) GROUP BY k); - -SET max_memory_usage = 0; -SET group_by_two_level_threshold = 100000; -SET max_bytes_before_external_group_by = '1Mi'; - --- method: key_string & key_string_two_level -CREATE TABLE t_00284_str(s String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; -INSERT INTO t_00284_str SELECT toString(number) FROM numbers_mt(1e6); -INSERT INTO t_00284_str SELECT toString(number) FROM numbers_mt(1e6); -SELECT s, count() FROM t_00284_str GROUP BY s ORDER BY s LIMIT 10 OFFSET 42; - --- method: low_cardinality_key_string & low_cardinality_key_string_two_level -CREATE TABLE t_00284_lc_str(s LowCardinality(String)) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; -INSERT INTO t_00284_lc_str SELECT toString(number) FROM numbers_mt(1e6); -INSERT INTO t_00284_lc_str SELECT toString(number) FROM numbers_mt(1e6); -SELECT s, count() FROM t_00284_lc_str GROUP BY s ORDER BY s LIMIT 10 OFFSET 42; diff --git a/tests/queries/0_stateless/00284_external_aggregation_2.reference b/tests/queries/0_stateless/00284_external_aggregation_2.reference new file mode 100644 index 00000000000..71d2e96d4b0 --- /dev/null +++ b/tests/queries/0_stateless/00284_external_aggregation_2.reference @@ -0,0 +1,20 @@ +100033 2 +100034 2 +100035 2 +100036 2 +100037 2 +100038 2 +100039 2 +10004 2 +100040 2 +100041 2 +100033 2 +100034 2 +100035 2 +100036 2 +100037 2 +100038 2 +100039 2 +10004 2 +100040 2 +100041 2 diff --git a/tests/queries/0_stateless/00284_external_aggregation_2.sql 
b/tests/queries/0_stateless/00284_external_aggregation_2.sql new file mode 100644 index 00000000000..7960e3894d0 --- /dev/null +++ b/tests/queries/0_stateless/00284_external_aggregation_2.sql @@ -0,0 +1,22 @@ +-- Tags: long + +-- This test was split in two due to long runtimes in sanitizers. +-- The other part is 00284_external_aggregation. + +SET group_by_two_level_threshold_bytes = 50000000; +SET max_memory_usage = 0; +SET group_by_two_level_threshold = 100000; +SET max_bytes_before_external_group_by = '1Mi'; + +-- method: key_string & key_string_two_level +CREATE TABLE t_00284_str(s String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; +INSERT INTO t_00284_str SELECT toString(number) FROM numbers_mt(1e6); +INSERT INTO t_00284_str SELECT toString(number) FROM numbers_mt(1e6); +SELECT s, count() FROM t_00284_str GROUP BY s ORDER BY s LIMIT 10 OFFSET 42; + +-- method: low_cardinality_key_string & low_cardinality_key_string_two_level +CREATE TABLE t_00284_lc_str(s LowCardinality(String)) ENGINE = MergeTree() ORDER BY tuple() SETTINGS index_granularity = 8192, index_granularity_bytes = '10Mi'; +INSERT INTO t_00284_lc_str SELECT toString(number) FROM numbers_mt(1e6); +INSERT INTO t_00284_lc_str SELECT toString(number) FROM numbers_mt(1e6); +SELECT s, count() FROM t_00284_lc_str GROUP BY s ORDER BY s LIMIT 10 OFFSET 42; + diff --git a/tests/queries/0_stateless/02099_tsv_raw_format_1.sh b/tests/queries/0_stateless/02099_tsv_raw_format_1.sh index a3468f46ca0..bd1f8731717 100755 --- a/tests/queries/0_stateless/02099_tsv_raw_format_1.sh +++ b/tests/queries/0_stateless/02099_tsv_raw_format_1.sh @@ -1,6 +1,9 @@ #!/usr/bin/env bash # Tags: long +# This test was split in two due to long runtimes in sanitizers. +# The other part is 02099_tsv_raw_format_2.sh. + CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh diff --git a/tests/queries/0_stateless/02099_tsv_raw_format_2.sh b/tests/queries/0_stateless/02099_tsv_raw_format_2.sh index d6034a0616f..9f57eea42f2 100755 --- a/tests/queries/0_stateless/02099_tsv_raw_format_2.sh +++ b/tests/queries/0_stateless/02099_tsv_raw_format_2.sh @@ -1,6 +1,9 @@ #!/usr/bin/env bash # Tags: long +# This test was split in two due to long runtimes in sanitizers. +# The other part is 02099_tsv_raw_format_1.sh. +# CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh From 308947ee0b14e4495ea81476321fbea459ded4a0 Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Thu, 8 Aug 2024 18:31:24 +0200 Subject: [PATCH 50/56] Fix --- ...ckhouse_local_interactive_exception.expect | 29 ----- ...ckhouse_local_interactive_exception.python | 110 ++++++++++++++++++ ...ouse_local_interactive_exception.reference | 1 + 3 files changed, 111 insertions(+), 29 deletions(-) delete mode 100755 tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect create mode 100644 tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect deleted file mode 100755 index add977c4fce..00000000000 --- a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.expect +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/expect -f - -set basedir [file dirname $argv0] -set basename [file tail $argv0] -if {[info exists env(CLICKHOUSE_TMP)]} { - set CLICKHOUSE_TMP $env(CLICKHOUSE_TMP) -} else { - set CLICKHOUSE_TMP "." -} -exp_internal -f $CLICKHOUSE_TMP/$basename.debuglog 0 - -log_user 0 -set timeout 20 -match_max 100000 - -expect_after { - -i $any_spawn_id eof { exp_continue } - -i $any_spawn_id timeout { exit 1 } -} - -spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_LOCAL --disable_suggestion" - -expect ":) " -send -- "insert into table function null() format TSV some trash here 123 \n 456\r" -expect "CANNOT_PARSE_INPUT_ASSERTION_FAILED" -expect ":) " - -send -- "" -expect eof diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python new file mode 100644 index 00000000000..03f8d493ec2 --- /dev/null +++ b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python @@ -0,0 +1,110 @@ +import pty +import os +import shlex +import time +import multiprocessing + +COMPLETION_TIMEOUT_SECONDS = 30 +DEBUG_LOG = os.path.join( + os.environ["CLICKHOUSE_TMP"], + os.path.basename(os.path.abspath(__file__)).strip(".python") + ".debuglog", +) + +STATE_MAP = { + -1: "process did not start", + 0: "all good", + 1: "process started and said ':)'", + 2: "prompt search was started", + 3: "prompt is missing", +} + + +def run_with_timeout(func, args, timeout): + for _ in range(5): + state = multiprocessing.Value("i", -1) + process = multiprocessing.Process( + target=func, args=args, kwargs={"state": state} + ) + process.start() + process.join(timeout) + + if state.value in (0, 3): + return + + if process.is_alive(): + process.terminate() + + if state.value == -1: + continue + + print(f"Timeout, state: {STATE_MAP[state.value]}") + return + + +def test_completion(program, argv, prompt, state=None): + shell_pid, master = pty.fork() + if shell_pid == 0: + os.execv(program, argv) + else: + try: + debug_log_fd = open(DEBUG_LOG, "a") + + output_b = os.read(master, 4096) + output = output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + while not ":)" in output: + output_b = os.read(master, 4096) + output += output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + + state.value = 1 + + os.write(master, bytes(prompt.encode())) + output_b = os.read(master, 4096) + output = output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + while not prompt[:-10] 
in output: + output_b = os.read(master, 4096) + output += output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + + time.sleep(0.01) + os.write(master, b"\r") + + state.value = 2 + + output_b = os.read(master, 4096) + output = output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + while not "CANNOT_PARSE_INPUT_ASSERTION_FAILED" in output: + output_b = os.read(master, 4096) + output += output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + + while not ":)" in output: + output_b = os.read(master, 4096) + output += output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + + print("OK") + state.value = 0 + finally: + os.close(master) + debug_log_fd.close() + + +if __name__ == "__main__": + clickhouse_local = os.environ["CLICKHOUSE_LOCAL"] + args = shlex.split(clickhouse_local) + args.append("--disable_suggestion") + args.append("--highlight=0") + run_with_timeout( + test_completion, [args[0], args, "insert into table function null() format TSV some trash here 123 \n 456"], COMPLETION_TIMEOUT_SECONDS + ) diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.reference b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.reference index e69de29bb2d..d86bac9de59 100644 --- a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.reference +++ b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.reference @@ -0,0 +1 @@ +OK From 34d2c71eadfefd79e3cfb62d46d156cdbdaab681 Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Thu, 8 Aug 2024 18:50:16 +0200 Subject: [PATCH 51/56] Cleanup --- ...ckhouse_local_interactive_exception.python | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python index 03f8d493ec2..9527991c36e 100644 --- a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python +++ b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python @@ -40,6 +40,18 @@ def run_with_timeout(func, args, timeout): print(f"Timeout, state: {STATE_MAP[state.value]}") return +def expect(text, master, debug_log_fd): + output_b = os.read(master, 4096) + output = output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + while not text in output: + output_b = os.read(master, 4096) + output += output_b.decode() + debug_log_fd.write(repr(output_b) + "\n") + debug_log_fd.flush() + + return output def test_completion(program, argv, prompt, state=None): shell_pid, master = pty.fork() @@ -49,43 +61,17 @@ def test_completion(program, argv, prompt, state=None): try: debug_log_fd = open(DEBUG_LOG, "a") - output_b = os.read(master, 4096) - output = output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() - while not ":)" in output: - output_b = os.read(master, 4096) - output += output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() + expect(":)", master, debug_log_fd) state.value = 1 - os.write(master, bytes(prompt.encode())) - output_b = os.read(master, 4096) - output = output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() - while not prompt[:-10] in output: - output_b = os.read(master, 4096) - output += output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - 
debug_log_fd.flush() + expect(prompt[:-10], master, debug_log_fd) time.sleep(0.01) os.write(master, b"\r") - state.value = 2 - output_b = os.read(master, 4096) - output = output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() - while not "CANNOT_PARSE_INPUT_ASSERTION_FAILED" in output: - output_b = os.read(master, 4096) - output += output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() + output = expect("CANNOT_PARSE_INPUT_ASSERTION_FAILED", master, debug_log_fd) while not ":)" in output: output_b = os.read(master, 4096) From 3062f221eb537a49f61bdc4ee058e011ed4d2246 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 8 Aug 2024 18:37:31 +0000 Subject: [PATCH 52/56] Automatic style fix --- ...02164_clickhouse_local_interactive_exception.python | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python index 9527991c36e..5ca7ac4e286 100644 --- a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python +++ b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python @@ -40,6 +40,7 @@ def run_with_timeout(func, args, timeout): print(f"Timeout, state: {STATE_MAP[state.value]}") return + def expect(text, master, debug_log_fd): output_b = os.read(master, 4096) output = output_b.decode() @@ -53,6 +54,7 @@ def expect(text, master, debug_log_fd): return output + def test_completion(program, argv, prompt, state=None): shell_pid, master = pty.fork() if shell_pid == 0: @@ -92,5 +94,11 @@ if __name__ == "__main__": args.append("--disable_suggestion") args.append("--highlight=0") run_with_timeout( - test_completion, [args[0], args, "insert into table function null() format TSV some trash here 123 \n 456"], COMPLETION_TIMEOUT_SECONDS + test_completion, + [ + args[0], + args, + "insert into table function null() format TSV some trash here 123 \n 456", + ], + COMPLETION_TIMEOUT_SECONDS, ) From 343accc92d86748b975f845e1154837d585dcd54 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 8 Aug 2024 20:41:11 +0200 Subject: [PATCH 53/56] Disable randomization of `trace_profile_events` in clickhouse-test --- tests/clickhouse-test | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index ffdd6169777..907d773337a 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -836,7 +836,6 @@ class SettingsRandomizer: "cross_join_min_bytes_to_compress": lambda: random.choice([0, 1, 100000000]), "min_external_table_block_size_bytes": lambda: random.choice([0, 1, 100000000]), "max_parsing_threads": lambda: random.choice([0, 1, 10]), - "trace_profile_events": lambda: random.randint(0, 1), "optimize_functions_to_subcolumns": lambda: random.randint(0, 1), } From 6e3df43ae3c13c037fa8e88ae7d102f94bb7398a Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Thu, 8 Aug 2024 22:09:58 +0200 Subject: [PATCH 54/56] Cleanup --- ...64_clickhouse_local_interactive_exception.python | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python index 5ca7ac4e286..4c2df9556a1 100644 --- a/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python +++ 
b/tests/queries/0_stateless/02164_clickhouse_local_interactive_exception.python @@ -41,11 +41,7 @@ def run_with_timeout(func, args, timeout): return -def expect(text, master, debug_log_fd): - output_b = os.read(master, 4096) - output = output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() +def expect(text, master, debug_log_fd, output=""): while not text in output: output_b = os.read(master, 4096) output += output_b.decode() @@ -74,12 +70,7 @@ def test_completion(program, argv, prompt, state=None): state.value = 2 output = expect("CANNOT_PARSE_INPUT_ASSERTION_FAILED", master, debug_log_fd) - - while not ":)" in output: - output_b = os.read(master, 4096) - output += output_b.decode() - debug_log_fd.write(repr(output_b) + "\n") - debug_log_fd.flush() + expect(":)", master, debug_log_fd, output) print("OK") state.value = 0 From eb4ea0757730f8b8e59b9230d72aea3ca4bb1ff3 Mon Sep 17 00:00:00 2001 From: justindeguzman Date: Thu, 8 Aug 2024 16:55:39 -0700 Subject: [PATCH 55/56] [Docs] Fix broken links --- docs/en/interfaces/prometheus.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/en/interfaces/prometheus.md b/docs/en/interfaces/prometheus.md index 75a68c59219..bf541901b34 100644 --- a/docs/en/interfaces/prometheus.md +++ b/docs/en/interfaces/prometheus.md @@ -25,7 +25,7 @@ ClickHouse can expose its own metrics for scraping from Prometheus: Section `` can be used to make more extended handlers. -This section is similar to [](http.md) but works for prometheus protocols: +This section is similar to [](/en/interfaces/http) but works for prometheus protocols: ```xml @@ -51,11 +51,11 @@ Settings: |---|---|---|---| | `port` | none | Port for serving the exposing metrics protocol. | | `endpoint` | `/metrics` | HTTP endpoint for scraping metrics by prometheus server. Starts with `/`. Should not be used with the `` section. | -| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](http.md) section. | -| `metrics` | true | Expose metrics from the [system.metrics](../operations/system-tables/metrics.md) table. | -| `asynchronous_metrics` | true | Expose current metrics values from the [system.asynchronous_metrics](../operations/system-tables/asynchronous_metrics.md) table. | -| `events` | true | Expose metrics from the [system.events](../operations/system-tables/events.md) table. | -| `errors` | true | Expose the number of errors by error codes occurred since the last server restart. This information could be obtained from the [system.errors](../operations/system-tables/errors.md) as well. | +| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](/en/interfaces/http) section. | +| `metrics` | true | Expose metrics from the [system.metrics](/en/operations/system-tables/metrics) table. | +| `asynchronous_metrics` | true | Expose current metrics values from the [system.asynchronous_metrics](/en/operations/system-tables/asynchronous_metrics) table. | +| `events` | true | Expose metrics from the [system.events](/en/operations/system-tables/events) table. | +| `errors` | true | Expose the number of errors by error codes occurred since the last server restart. This information could be obtained from the [system.errors](/en/operations/system-tables/errors) as well. 
| Check (replace `127.0.0.1` with the IP addr or hostname of your ClickHouse server): ```bash @@ -65,7 +65,7 @@ curl 127.0.0.1:9363/metrics ## Remote-write protocol {#remote-write} ClickHouse supports the [remote-write](https://prometheus.io/docs/specs/remote_write_spec/) protocol. -Data are received by this protocol and written to a [TimeSeries](../engines/table-engines/integrations/time-series.md) table +Data are received by this protocol and written to a [TimeSeries](/en/engines/table-engines/special/time_series) table (which should be created beforehand). ```xml @@ -89,14 +89,14 @@ Settings: | Name | Default | Description | |---|---|---|---| | `port` | none | Port for serving the `remote-write` protocol. | -| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](http.md) section. | -| `table` | none | The name of a [TimeSeries](../engines/table-engines/integrations/time-series.md) table to write data received by the `remote-write` protocol. This name can optionally contain the name of a database too. | +| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](/en/interfaces/http) section. | +| `table` | none | The name of a [TimeSeries](/en/engines/table-engines/special/time_series) table to write data received by the `remote-write` protocol. This name can optionally contain the name of a database too. | | `database` | none | The name of a database where the table specified in the `table` setting is located if it's not specified in the `table` setting. | ## Remote-read protocol {#remote-read} ClickHouse supports the [remote-read](https://prometheus.io/docs/prometheus/latest/querying/remote_read_api/) protocol. -Data are read from a [TimeSeries](../engines/table-engines/integrations/time-series.md) table and sent via this protocol. +Data are read from a [TimeSeries](/en/engines/table-engines/special/time_series) table and sent via this protocol. ```xml @@ -119,8 +119,8 @@ Settings: | Name | Default | Description | |---|---|---|---| | `port` | none | Port for serving the `remote-read` protocol. | -| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](http.md) section. | -| `table` | none | The name of a [TimeSeries](../engines/table-engines/integrations/time-series.md) table to read data to send by the `remote-read` protocol. This name can optionally contain the name of a database too. | +| `url` / `headers` / `method` | none | Filters used to find a matching handler for a request. Similar to the fields with the same names in the [](/en/interfaces/http) section. | +| `table` | none | The name of a [TimeSeries](/en/engines/table-engines/special/time_series) table to read data to send by the `remote-read` protocol. This name can optionally contain the name of a database too. | | `database` | none | The name of a database where the table specified in the `table` setting is located if it's not specified in the `table` setting. 
| ## Configuration for multiple protocols {#multiple-protocols} From d7442e0670b1900e73299341d44287d21eafd0ad Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 8 Aug 2024 20:45:43 -0400 Subject: [PATCH 56/56] Fix flaky 02572_query_views_log_background_thread --- .../02572_query_views_log_background_thread.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02572_query_views_log_background_thread.sh b/tests/queries/0_stateless/02572_query_views_log_background_thread.sh index a3e428e75c8..509cd03f6c2 100755 --- a/tests/queries/0_stateless/02572_query_views_log_background_thread.sh +++ b/tests/queries/0_stateless/02572_query_views_log_background_thread.sh @@ -13,13 +13,16 @@ ${CLICKHOUSE_CLIENT} --ignore-error --multiquery --query "drop table if exists b ${CLICKHOUSE_CLIENT} --query="create table copy_02572 (key Int) engine=Memory();" ${CLICKHOUSE_CLIENT} --query="create table data_02572 (key Int) engine=Memory();" -${CLICKHOUSE_CLIENT} --query="create table buffer_02572 (key Int) engine=Buffer(currentDatabase(), data_02572, 1, 3, 3, 1, 1e9, 1, 1e9);" +${CLICKHOUSE_CLIENT} --query="create table buffer_02572 (key Int) engine=Buffer(currentDatabase(), data_02572, 1, 8, 8, 1, 1e9, 1, 1e9);" ${CLICKHOUSE_CLIENT} --query="create materialized view mv_02572 to copy_02572 as select * from data_02572;" +start=$(date +%s) ${CLICKHOUSE_CLIENT} --query="insert into buffer_02572 values (1);" -# ensure that the flush was not direct -${CLICKHOUSE_CLIENT} --ignore-error --multiquery --query "select * from data_02572; select * from copy_02572;" +if [ $(( $(date +%s) - start )) -gt 6 ]; then # clickhouse test cluster is overloaded, will skip + # ensure that the flush was not direct + ${CLICKHOUSE_CLIENT} --ignore-error --multiquery --query "select * from data_02572; select * from copy_02572;" +fi # we cannot use OPTIMIZE, this will attach query context, so let's wait for _ in {1..100}; do