From d507884a1d905bc729682add414d5b4048bfa686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 23 Feb 2024 17:59:25 +0100 Subject: [PATCH 01/66] Force reanalysis if parallel replicas changed --- src/Interpreters/InterpreterSelectQuery.cpp | 13 +++++++++++++ .../02972_parallel_replicas_cte.reference | 1 + .../0_stateless/02972_parallel_replicas_cte.sql | 9 +++++++++ 3 files changed, 23 insertions(+) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d34294b4c4b..0a2b8d9a0d7 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -777,12 +777,25 @@ InterpreterSelectQuery::InterpreterSelectQuery( result_header = getSampleBlockImpl(); }; + /// This is a hack to make sure we reanalyze if GlobalSubqueriesVisitor changed allow_experimental_parallel_reading_from_replicas + UInt64 parallel_replicas_before_analysis + = context->hasQueryContext() ? context->getQueryContext()->getSettingsRef().allow_experimental_parallel_reading_from_replicas : 0; analyze(shouldMoveToPrewhere()); bool need_analyze_again = false; bool can_analyze_again = false; + if (context->hasQueryContext()) { + /// No buts or ifs, if the analysis changed this setting we must reanalyze without parallel replicas + if (context->getQueryContext()->getSettingsRef().allow_experimental_parallel_reading_from_replicas + != parallel_replicas_before_analysis) + { + context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + context->setSetting("max_parallel_replicas", UInt64{0}); + need_analyze_again = true; + } + /// Check number of calls of 'analyze' function. /// If it is too big, we will not analyze the query again not to have exponential blowup. 
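/// A minimal sketch of the control flow introduced above (condensed for
/// readability; the real code reads the setting through
/// context->getQueryContext()->getSettingsRef()):
///
///     UInt64 before = settings.allow_experimental_parallel_reading_from_replicas;
///     analyze(shouldMoveToPrewhere());
///     if (settings.allow_experimental_parallel_reading_from_replicas != before)
///     {
///         context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0));
///         need_analyze_again = true; /// rerun the analysis with parallel replicas disabled
///     }
///
/// GlobalSubqueriesVisitor may flip this setting inside the query context during
/// analyze(), and the counter below bounds how many such extra passes can run.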
std::atomic & current_query_analyze_count = context->getQueryContext()->kitchen_sink.analyze_counter; diff --git a/tests/queries/0_stateless/02972_parallel_replicas_cte.reference b/tests/queries/0_stateless/02972_parallel_replicas_cte.reference index 449fe3d34e3..3321ade3a24 100644 --- a/tests/queries/0_stateless/02972_parallel_replicas_cte.reference +++ b/tests/queries/0_stateless/02972_parallel_replicas_cte.reference @@ -1,3 +1,4 @@ 990000 990000 10 +990000 diff --git a/tests/queries/0_stateless/02972_parallel_replicas_cte.sql b/tests/queries/0_stateless/02972_parallel_replicas_cte.sql index 51ce18784da..c7143b5aa93 100644 --- a/tests/queries/0_stateless/02972_parallel_replicas_cte.sql +++ b/tests/queries/0_stateless/02972_parallel_replicas_cte.sql @@ -28,5 +28,14 @@ SETTINGS allow_experimental_analyzer = 0, allow_experimental_parallel_reading_fr SELECT count() FROM pr_2 JOIN numbers(10) as pr_1 ON pr_2.a = pr_1.number SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; +-- Being a subquery should still disable parallel replicas +SELECT * +FROM +( + WITH filtered_groups AS (SELECT a FROM pr_1 WHERE a >= 10000) + SELECT count() FROM pr_2 INNER JOIN filtered_groups ON pr_2.a = filtered_groups.a +) +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; + DROP TABLE IF EXISTS pr_1; DROP TABLE IF EXISTS pr_2; From f885423a4a1c95118e026042a878226749d93441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 26 Feb 2024 15:09:05 +0100 Subject: [PATCH 02/66] Cleanup and more tests --- src/Interpreters/InterpreterSelectQuery.cpp | 3 +- .../02972_parallel_replicas_cte.sql | 42 ++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 135b535595c..1a9827d30f8 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -778,6 +778,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( }; /// This is a hack to make sure we reanalyze if GlobalSubqueriesVisitor changed allow_experimental_parallel_reading_from_replicas + /// inside the query context (because it doesn't have write access to the main context) UInt64 parallel_replicas_before_analysis = context->hasQueryContext() ? 
context->getQueryContext()->getSettingsRef().allow_experimental_parallel_reading_from_replicas : 0; analyze(shouldMoveToPrewhere()); @@ -787,7 +788,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (context->hasQueryContext()) { - /// No buts or ifs, if the analysis changed this setting we must reanalyze without parallel replicas + /// As this query can't be executed with parallel replicas, we must reanalyze it if (context->getQueryContext()->getSettingsRef().allow_experimental_parallel_reading_from_replicas != parallel_replicas_before_analysis) { diff --git a/tests/queries/0_stateless/02972_parallel_replicas_cte.sql b/tests/queries/0_stateless/02972_parallel_replicas_cte.sql index c7143b5aa93..c9ab83ff9ad 100644 --- a/tests/queries/0_stateless/02972_parallel_replicas_cte.sql +++ b/tests/queries/0_stateless/02972_parallel_replicas_cte.sql @@ -1,5 +1,6 @@ DROP TABLE IF EXISTS pr_1; DROP TABLE IF EXISTS pr_2; +DROP TABLE IF EXISTS numbers_1e6; CREATE TABLE pr_1 (`a` UInt32) ENGINE = MergeTree ORDER BY a PARTITION BY a % 10 AS SELECT 10 * intDiv(number, 10) + 1 FROM numbers(1_000_000); @@ -28,7 +29,7 @@ SETTINGS allow_experimental_analyzer = 0, allow_experimental_parallel_reading_fr SELECT count() FROM pr_2 JOIN numbers(10) as pr_1 ON pr_2.a = pr_1.number SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; --- Being a subquery should still disable parallel replicas +-- Parallel replicas detection should work inside subqueries SELECT * FROM ( @@ -37,5 +38,44 @@ FROM ) SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; +-- Subquery + subquery +SELECT count() +FROM +( + SELECT c + 1 + FROM + ( + WITH filtered_groups AS (SELECT a FROM pr_1 WHERE a >= 10000) + SELECT count() as c FROM pr_2 INNER JOIN filtered_groups ON pr_2.a = filtered_groups.a + ) +) +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; + +CREATE TABLE numbers_1e6 +( + `n` UInt64 +) +ENGINE = MergeTree +ORDER BY n +AS SELECT * FROM numbers(1_000_000); + +-- Same but nested CTE's +WITH + cte1 AS + ( + SELECT n + FROM numbers_1e6 + ), + cte2 AS + ( + SELECT n + FROM numbers_1e6 + WHERE n IN (cte1) + ) +SELECT count() +FROM cte2 +SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; + +DROP TABLE IF EXISTS numbers_1e6; DROP TABLE IF EXISTS pr_1; DROP TABLE IF EXISTS pr_2; From 0b72f7b18201819b4997c675a7bcc2ac19654908 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Mon, 26 Feb 2024 22:46:51 +0800 Subject: [PATCH 03/66] Make all format names case insensitive. 
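A quick illustration of the behavior this enables (queries taken from the test
added below; any capitalization of a registered format name now resolves to the
same format):

    SELECT number, 'Hello & world' FROM numbers(3) FORMAT Tsv;  -- same as TSV
    SELECT number, 'Hello & world' FROM numbers(3) FORMAT csv;  -- same as CSV
    SELECT * FROM format(cSv, '0,Hello & world');               -- table functions too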
--- docs/en/interfaces/formats.md | 1 + src/Formats/FormatFactory.cpp | 32 ++++--- src/Formats/FormatFactory.h | 5 +- src/Functions/formatRow.cpp | 4 +- .../00309_formats_case_insensitive.reference | 95 +++++++++++++++++++ .../00309_formats_case_insensitive.sql | 23 +++++ 6 files changed, 145 insertions(+), 15 deletions(-) create mode 100644 tests/queries/0_stateless/00309_formats_case_insensitive.reference create mode 100644 tests/queries/0_stateless/00309_formats_case_insensitive.sql diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 285737312bd..a76bb01ce9e 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -7,6 +7,7 @@ title: Formats for Input and Output Data ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read a dictionary. A format supported for output can be used to arrange the results of a `SELECT`, and to perform `INSERT`s into a file-backed table. +All format names are case insensitive. The supported formats are: diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 0654dd01e49..38b29bc6405 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -31,9 +31,18 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +String FormatFactory::getOriginalFormatNameIfExists(const String & name) const +{ + String case_insensitive_format_name = boost::to_lower_copy(name); + auto it = file_extension_formats.find(case_insensitive_format_name); + if (file_extension_formats.end() != it) + return it->second; + return name; +} + const FormatFactory::Creators & FormatFactory::getCreators(const String & name) const { - auto it = dict.find(name); + auto it = dict.find(getOriginalFormatNameIfExists(name)); if (dict.end() != it) return it->second; throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); @@ -542,7 +551,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader( const ContextPtr & context, const std::optional & _format_settings) const { - const auto & schema_reader_creator = dict.at(name).schema_reader_creator; + const auto & schema_reader_creator = getCreators(name).schema_reader_creator; if (!schema_reader_creator) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name); @@ -558,7 +567,7 @@ ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( const ContextPtr & context, const std::optional & _format_settings) const { - const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; + const auto & external_schema_reader_creator = getCreators(name).external_schema_reader_creator; if (!external_schema_reader_creator) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} doesn't support schema inference.", name); @@ -574,7 +583,7 @@ void FormatFactory::registerInputFormat(const String & name, InputCreator input_ throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name); creators.input_creator = std::move(input_creator); registerFileExtension(name, name); - KnownFormatNames::instance().add(name); + KnownFormatNames::instance().add(name, /* case_insensitive = */ true); } void FormatFactory::registerRandomAccessInputFormat(const String & name, RandomAccessInputCreator input_creator) @@ -585,7 +594,7 @@ void 
FormatFactory::registerRandomAccessInputFormat(const String & name, RandomA throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name); creators.random_access_input_creator = std::move(input_creator); registerFileExtension(name, name); - KnownFormatNames::instance().add(name); + KnownFormatNames::instance().add(name, /* case_insensitive = */ true); } void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker) @@ -612,7 +621,7 @@ void FormatFactory::markFormatHasNoAppendSupport(const String & name) bool FormatFactory::checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_) { auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context); - auto & append_support_checker = dict[name].append_support_checker; + const auto & append_support_checker = getCreators(name).append_support_checker; /// By default we consider that format supports append return !append_support_checker || append_support_checker(format_settings); } @@ -624,7 +633,7 @@ void FormatFactory::registerOutputFormat(const String & name, OutputCreator outp throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already registered", name); target = std::move(output_creator); registerFileExtension(name, name); - KnownFormatNames::instance().add(name); + KnownFormatNames::instance().add(name, /* case_insensitive = */ true); } void FormatFactory::registerFileExtension(const String & extension, const String & format_name) @@ -791,13 +800,13 @@ String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, const bool FormatFactory::isInputFormat(const String & name) const { - auto it = dict.find(name); + auto it = dict.find(getOriginalFormatNameIfExists(name)); return it != dict.end() && (it->second.input_creator || it->second.random_access_input_creator); } bool FormatFactory::isOutputFormat(const String & name) const { - auto it = dict.find(name); + auto it = dict.find(getOriginalFormatNameIfExists(name)); return it != dict.end() && it->second.output_creator; } @@ -826,7 +835,8 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) c bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const { - if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) + auto format_name = getOriginalFormatNameIfExists(name); + if (format_name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) return false; return true; @@ -834,7 +844,7 @@ bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, cons void FormatFactory::checkFormatName(const String & name) const { - auto it = dict.find(name); + auto it = dict.find(getOriginalFormatNameIfExists(name)); if (it == dict.end()) throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 165a20f7c4d..145f6258933 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -266,7 +266,7 @@ public: private: FormatsDictionary dict; - FileExtensionFormats file_extension_formats; + FileExtensionFormats file_extension_formats; // Also used as a case-insensitive format_name mapping. 
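// A rough sketch of the lookup path this comment refers to (names as in this
// patch; see getOriginalFormatNameIfExists below):
//     getCreators("csv") -> lower-case "csv" -> file_extension_formats -> "CSV" -> dict.find("CSV")
// i.e. the lower-cased name is mapped back to its canonical spelling through
// this map, then looked up in the main dictionary.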
const Creators & getCreators(const String & name) const; @@ -279,6 +279,9 @@ private: const Settings & settings, bool is_remote_fs, size_t max_download_threads) const; + + // Mapping case-insensitive format_name to a key in FormatsDictionary if exists. + String getOriginalFormatNameIfExists(const String & name) const; }; } diff --git a/src/Functions/formatRow.cpp b/src/Functions/formatRow.cpp index 12a5fc2cc27..1ac6becfb15 100644 --- a/src/Functions/formatRow.cpp +++ b/src/Functions/formatRow.cpp @@ -18,7 +18,6 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int UNKNOWN_FORMAT; extern const int BAD_ARGUMENTS; } @@ -40,8 +39,7 @@ public: , arguments_column_names(std::move(arguments_column_names_)) , context(std::move(context_)) { - if (!FormatFactory::instance().getAllFormats().contains(format_name)) - throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", format_name); + FormatFactory::instance().checkFormatName(format_name); } String getName() const override { return name; } diff --git a/tests/queries/0_stateless/00309_formats_case_insensitive.reference b/tests/queries/0_stateless/00309_formats_case_insensitive.reference new file mode 100644 index 00000000000..b74d7002833 --- /dev/null +++ b/tests/queries/0_stateless/00309_formats_case_insensitive.reference @@ -0,0 +1,95 @@ +-- test FORMAT clause -- +0 Hello & world +1 Hello & world +2 Hello & world +0,"Hello & world" +1,"Hello & world" +2,"Hello & world" + + + + + + number + UInt64 + + + 'Hello & world' + String + + + + + + 0 + Hello & world + + + 1 + Hello & world + + + 2 + Hello & world + + + 3 + +{ + "meta": + [ + { + "name": "number", + "type": "UInt64" + }, + { + "name": "'Hello & world'", + "type": "String" + } + ], + + "data": + [ + { + "number": "0", + "'Hello & world'": "Hello & world" + }, + { + "number": "1", + "'Hello & world'": "Hello & world" + }, + { + "number": "2", + "'Hello & world'": "Hello & world" + } + ], + + "rows": 3 +} +Row 1: +────── +number: 0 +'Hello & world': Hello & world + +Row 2: +────── +number: 1 +'Hello & world': Hello & world + +Row 3: +────── +number: 2 +'Hello & world': Hello & world +-- test table function -- +0 Hello & world +1 Hello & world +2 Hello & world +0 Hello & world +1 Hello & world +2 Hello & world +-- test other function -- +0 Hello & world +-- test table engine -- +0 Hello & world +1 Hello & world +2 Hello & world diff --git a/tests/queries/0_stateless/00309_formats_case_insensitive.sql b/tests/queries/0_stateless/00309_formats_case_insensitive.sql new file mode 100644 index 00000000000..b4037ed9861 --- /dev/null +++ b/tests/queries/0_stateless/00309_formats_case_insensitive.sql @@ -0,0 +1,23 @@ +SELECT '-- test FORMAT clause --'; +SET output_format_write_statistics = 0; +SELECT number, 'Hello & world' FROM numbers(3) FORMAT Tsv; +SELECT number, 'Hello & world' FROM numbers(3) FORMAT csv; +SELECT number, 'Hello & world' FROM numbers(3) FORMAT xMl; +SELECT number, 'Hello & world' FROM numbers(3) FORMAT JsonStrINGs; +SELECT number, 'Hello & world' FROM numbers(3) FORMAT VERTICAL; + +SELECT '-- test table function --'; +INSERT INTO FUNCTION file('data_00309_formats_case_insensitive', 'Csv') SELECT number, 'Hello & world' FROM numbers(3) SETTINGS engine_file_truncate_on_insert=1; +SELECT * FROM file('data_00309_formats_case_insensitive', 'Csv'); + +INSERT INTO FUNCTION file('data_00309_formats_case_insensitive.cSv') SELECT number, 'Hello & world' FROM numbers(3) SETTINGS 
engine_file_truncate_on_insert=1; +SELECT * FROM file('data_00309_formats_case_insensitive.cSv'); + +SELECT '-- test other function --'; +SELECT * FROM format(cSv, '0,Hello & world'); + +SELECT '-- test table engine --'; +DROP TABLE IF EXISTS test_00309_formats_case_insensitive; +CREATE TABLE test_00309_formats_case_insensitive(a Int64, b String) ENGINE=File(Csv); +INSERT INTO test_00309_formats_case_insensitive SELECT number, 'Hello & world' FROM numbers(3); +SELECT * FROM test_00309_formats_case_insensitive; From ea89fa0de9a69d68e1a29447ed3d022ed9a3bf84 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Mon, 26 Feb 2024 23:07:50 +0800 Subject: [PATCH 04/66] replace to checkFormatName --- src/Storages/StorageAzureBlob.cpp | 8 ++++---- src/Storages/StorageS3.cpp | 7 ++++--- .../TableFunctionAzureBlobStorage.cpp | 5 +++-- src/TableFunctions/TableFunctionS3.cpp | 15 ++++++++------- 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index f5fcf01c59e..94bb5d3cf60 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -68,7 +68,7 @@ namespace ErrorCodes extern const int CANNOT_DETECT_FORMAT; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; - + extern const int UNKNOWN_FORMAT; } namespace @@ -167,7 +167,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine auto is_format_arg = [] (const std::string & s) -> bool { - return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); + return s == "auto" || FormatFactory::instance().checkFormatName(s); }; if (engine_args.size() == 4) @@ -200,7 +200,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine else if (engine_args.size() == 6) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); } @@ -218,7 +218,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine else if (engine_args.size() == 7) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d8ef3df1c8..07f68072bb6 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -133,6 +133,7 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int CANNOT_COMPILE_REGEXP; extern const int FILE_DOESNT_EXIST; + extern const int UNKNOWN_FORMAT; } @@ -1531,7 +1532,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const C no_sign_request = true; engine_args_to_idx = {{"format", 2}}; } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) engine_args_to_idx = {{"format", 1}, {"compression_method", 2}}; else engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; @@ 
-1552,7 +1553,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const C else { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "session_token/format"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; } @@ -1568,7 +1569,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const C else if (count == 5) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "session_token/format"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"compression", 4}}; } diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index 066d6338b6a..ac96364b5bd 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -32,6 +32,7 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int BAD_ARGUMENTS; + extern const int UNKNOWN_FORMAT; } namespace @@ -80,7 +81,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); auto is_format_arg - = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; + = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().checkFormatName(s); }; if (engine_args.size() == 4) { @@ -207,7 +208,7 @@ void TableFunctionAzureBlobStorage::updateStructureAndFormatArgumentsIfNeeded(AS arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); auto is_format_arg - = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; + = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().checkFormatName(s); }; /// (connection_string, container_name, blobpath) if (args.size() == 3) diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 3fedd38277c..04182fa4e68 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -31,6 +31,7 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int LOGICAL_ERROR; + extern const int UNKNOWN_FORMAT; } @@ -100,7 +101,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context no_sign_request = true; args_to_idx = {{"format", 2}}; } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) args_to_idx = {{"format", 1}, {"structure", 2}}; else args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; @@ -119,14 +120,14 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context no_sign_request = true; args_to_idx = {{"format", 2}, {"structure", 3}}; } - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + else if (second_arg == "auto" || 
FormatFactory::instance().checkFormatName(second_arg)) { args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; } else { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; } @@ -153,7 +154,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context else { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}; } @@ -170,7 +171,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context else if (count == 6) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().getAllFormats().contains(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) { args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}; } @@ -300,7 +301,7 @@ void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, con args.push_back(structure_literal); } /// s3(source, format, structure) - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) { if (second_arg == "auto") args[1] = format_literal; @@ -330,7 +331,7 @@ void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, con args[3] = structure_literal; } /// s3(source, format, structure, compression_method) - else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) { if (second_arg == "auto") args[1] = format_literal; From dbd8d35f01a55389c4fbfbcdf1b2d0b9f9b703ba Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Tue, 27 Feb 2024 00:48:34 +0800 Subject: [PATCH 05/66] use lower case in dict --- src/Formats/FormatFactory.cpp | 64 +++++++++++-------- src/Formats/FormatFactory.h | 8 +-- src/Storages/StorageAzureBlob.cpp | 7 +- src/Storages/StorageS3.cpp | 7 +- src/Storages/System/StorageSystemFormats.cpp | 3 +- .../TableFunctionAzureBlobStorage.cpp | 5 +- src/TableFunctions/TableFunctionS3.cpp | 15 ++--- 7 files changed, 59 insertions(+), 50 deletions(-) diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 38b29bc6405..2bead318173 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -31,23 +31,35 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -String FormatFactory::getOriginalFormatNameIfExists(const String & name) const +bool FormatFactory::exists(const String & name) const { - String case_insensitive_format_name = boost::to_lower_copy(name); - auto it = file_extension_formats.find(case_insensitive_format_name); - if (file_extension_formats.end() != it) - return it->second; - return name; + return dict.find(boost::to_lower_copy(name)) != dict.end(); } const 
FormatFactory::Creators & FormatFactory::getCreators(const String & name) const { - auto it = dict.find(getOriginalFormatNameIfExists(name)); + auto it = dict.find(boost::to_lower_copy(name)); if (dict.end() != it) return it->second; throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } +FormatFactory::Creators & FormatFactory::getOrCreateCreators(const String & name) +{ + String lower_case = boost::to_lower_copy(name); + auto it = dict.find(lower_case); + if (dict.end() != it) + { + return it->second; + } + else + { + auto & creators = dict[lower_case]; + creators.name = name; + return creators; + } +} + FormatSettings getFormatSettings(const ContextPtr & context) { const auto & settings = context->getSettingsRef(); @@ -578,7 +590,7 @@ ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( void FormatFactory::registerInputFormat(const String & name, InputCreator input_creator) { chassert(input_creator); - auto & creators = dict[name]; + auto & creators = getOrCreateCreators(name); if (creators.input_creator || creators.random_access_input_creator) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name); creators.input_creator = std::move(input_creator); @@ -589,7 +601,7 @@ void FormatFactory::registerInputFormat(const String & name, InputCreator input_ void FormatFactory::registerRandomAccessInputFormat(const String & name, RandomAccessInputCreator input_creator) { chassert(input_creator); - auto & creators = dict[name]; + auto & creators = getOrCreateCreators(name); if (creators.input_creator || creators.random_access_input_creator) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Input format {} is already registered", name); creators.random_access_input_creator = std::move(input_creator); @@ -599,7 +611,7 @@ void FormatFactory::registerRandomAccessInputFormat(const String & name, RandomA void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name, NonTrivialPrefixAndSuffixChecker non_trivial_prefix_and_suffix_checker) { - auto & target = dict[name].non_trivial_prefix_and_suffix_checker; + auto & target = getOrCreateCreators(name).non_trivial_prefix_and_suffix_checker; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Non trivial prefix and suffix checker {} is already registered", name); target = std::move(non_trivial_prefix_and_suffix_checker); @@ -607,7 +619,7 @@ void FormatFactory::registerNonTrivialPrefixAndSuffixChecker(const String & name void FormatFactory::registerAppendSupportChecker(const String & name, AppendSupportChecker append_support_checker) { - auto & target = dict[name].append_support_checker; + auto & target = getOrCreateCreators(name).append_support_checker; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Suffix checker {} is already registered", name); target = std::move(append_support_checker); @@ -628,7 +640,7 @@ bool FormatFactory::checkIfFormatSupportAppend(const String & name, const Contex void FormatFactory::registerOutputFormat(const String & name, OutputCreator output_creator) { - auto & target = dict[name].output_creator; + auto & target = getOrCreateCreators(name).output_creator; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already registered", name); target = std::move(output_creator); @@ -705,7 +717,7 @@ String FormatFactory::getFormatFromFileDescriptor(int fd) void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine 
file_segmentation_engine) { - auto & target = dict[name].file_segmentation_engine_creator; + auto & target = getOrCreateCreators(name).file_segmentation_engine_creator; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: File segmentation engine {} is already registered", name); auto creator = [file_segmentation_engine](const FormatSettings &) @@ -717,7 +729,7 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm void FormatFactory::registerFileSegmentationEngineCreator(const String & name, FileSegmentationEngineCreator file_segmentation_engine_creator) { - auto & target = dict[name].file_segmentation_engine_creator; + auto & target = getOrCreateCreators(name).file_segmentation_engine_creator; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: File segmentation engine creator {} is already registered", name); target = std::move(file_segmentation_engine_creator); @@ -725,7 +737,7 @@ void FormatFactory::registerFileSegmentationEngineCreator(const String & name, F void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator) { - auto & target = dict[name].schema_reader_creator; + auto & target = getOrCreateCreators(name).schema_reader_creator; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Schema reader {} is already registered", name); target = std::move(schema_reader_creator); @@ -733,7 +745,7 @@ void FormatFactory::registerSchemaReader(const String & name, SchemaReaderCreato void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSchemaReaderCreator external_schema_reader_creator) { - auto & target = dict[name].external_schema_reader_creator; + auto & target = getOrCreateCreators(name).external_schema_reader_creator; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Schema reader {} is already registered", name); target = std::move(external_schema_reader_creator); @@ -741,7 +753,7 @@ void FormatFactory::registerExternalSchemaReader(const String & name, ExternalSc void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name) { - auto & target = dict[name].supports_parallel_formatting; + auto & target = getOrCreateCreators(name).supports_parallel_formatting; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Output format {} is already marked as supporting parallel formatting", name); target = true; @@ -750,7 +762,7 @@ void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & na void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name) { - auto & target = dict[name].subset_of_columns_support_checker; + auto & target = getOrCreateCreators(name).subset_of_columns_support_checker; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name); target = [](const FormatSettings &){ return true; }; @@ -758,7 +770,7 @@ void FormatFactory::markFormatSupportsSubsetOfColumns(const String & name) void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, SubsetOfColumnsSupportChecker subset_of_columns_support_checker) { - auto & target = dict[name].subset_of_columns_support_checker; + auto & target = getOrCreateCreators(name).subset_of_columns_support_checker; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as supporting subset of columns", name); target = std::move(subset_of_columns_support_checker); @@ 
-766,7 +778,7 @@ void FormatFactory::registerSubsetOfColumnsSupportChecker(const String & name, S void FormatFactory::markOutputFormatPrefersLargeBlocks(const String & name) { - auto & target = dict[name].prefers_large_blocks; + auto & target = getOrCreateCreators(name).prefers_large_blocks; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: Format {} is already marked as preferring large blocks", name); target = true; @@ -782,7 +794,7 @@ bool FormatFactory::checkIfFormatSupportsSubsetOfColumns(const String & name, co void FormatFactory::registerAdditionalInfoForSchemaCacheGetter( const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter) { - auto & target = dict[name].additional_info_for_schema_cache_getter; + auto & target = getOrCreateCreators(name).additional_info_for_schema_cache_getter; if (target) throw Exception(ErrorCodes::LOGICAL_ERROR, "FormatFactory: additional info for schema cache getter {} is already registered", name); target = std::move(additional_info_for_schema_cache_getter); @@ -800,13 +812,13 @@ String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, const bool FormatFactory::isInputFormat(const String & name) const { - auto it = dict.find(getOriginalFormatNameIfExists(name)); + auto it = dict.find(boost::to_lower_copy(name)); return it != dict.end() && (it->second.input_creator || it->second.random_access_input_creator); } bool FormatFactory::isOutputFormat(const String & name) const { - auto it = dict.find(getOriginalFormatNameIfExists(name)); + auto it = dict.find(boost::to_lower_copy(name)); return it != dict.end() && it->second.output_creator; } @@ -835,8 +847,8 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) c bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const { - auto format_name = getOriginalFormatNameIfExists(name); - if (format_name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) + auto format_name = boost::to_lower_copy(name); + if (format_name == "parquet" && context->getSettingsRef().input_format_parquet_preserve_order) return false; return true; @@ -844,7 +856,7 @@ bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, cons void FormatFactory::checkFormatName(const String & name) const { - auto it = dict.find(getOriginalFormatNameIfExists(name)); + auto it = dict.find(boost::to_lower_copy(name)); if (it == dict.end()) throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 145f6258933..46c1b8ddcdd 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -132,6 +132,7 @@ private: struct Creators { + String name; InputCreator input_creator; RandomAccessInputCreator random_access_input_creator; OutputCreator output_creator; @@ -263,12 +264,14 @@ public: /// Check that format with specified name exists and throw an exception otherwise. void checkFormatName(const String & name) const; + bool exists(const String & name) const; private: FormatsDictionary dict; - FileExtensionFormats file_extension_formats; // Also used as a case-insensitive format_name mapping. + FileExtensionFormats file_extension_formats; const Creators & getCreators(const String & name) const; + Creators & getOrCreateCreators(const String & name); // Creates a ReadBuffer to give to an input format. Returns nullptr if we should use `buf` directly. 
std::unique_ptr wrapReadBufferIfNeeded( @@ -279,9 +282,6 @@ private: const Settings & settings, bool is_remote_fs, size_t max_download_threads) const; - - // Mapping case-insensitive format_name to a key in FormatsDictionary if exists. - String getOriginalFormatNameIfExists(const String & name) const; }; } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 94bb5d3cf60..d484fefc46f 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -68,7 +68,6 @@ namespace ErrorCodes extern const int CANNOT_DETECT_FORMAT; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; - extern const int UNKNOWN_FORMAT; } namespace @@ -167,7 +166,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine auto is_format_arg = [] (const std::string & s) -> bool { - return s == "auto" || FormatFactory::instance().checkFormatName(s); + return s == "auto" || FormatFactory::instance().exists(s); }; if (engine_args.size() == 4) @@ -200,7 +199,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine else if (engine_args.size() == 6) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); } @@ -218,7 +217,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine else if (engine_args.size() == 7) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 07f68072bb6..e59a09efb20 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -133,7 +133,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int CANNOT_COMPILE_REGEXP; extern const int FILE_DOESNT_EXIST; - extern const int UNKNOWN_FORMAT; } @@ -1532,7 +1531,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const C no_sign_request = true; engine_args_to_idx = {{"format", 2}}; } - else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().exists(second_arg)) engine_args_to_idx = {{"format", 1}, {"compression_method", 2}}; else engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; @@ -1553,7 +1552,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const C else { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "session_token/format"); - if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; } @@ -1569,7 +1568,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const C else if (count == 5) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "session_token/format"); - if (fourth_arg == "auto" || 
FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"compression", 4}}; } diff --git a/src/Storages/System/StorageSystemFormats.cpp b/src/Storages/System/StorageSystemFormats.cpp index a360971e1f7..849e4eadf78 100644 --- a/src/Storages/System/StorageSystemFormats.cpp +++ b/src/Storages/System/StorageSystemFormats.cpp @@ -23,7 +23,8 @@ void StorageSystemFormats::fillData(MutableColumns & res_columns, ContextPtr, co const auto & formats = FormatFactory::instance().getAllFormats(); for (const auto & pair : formats) { - const auto & [format_name, creators] = pair; + const auto & [name, creators] = pair; + String format_name = creators.name; UInt64 has_input_format(creators.input_creator != nullptr || creators.random_access_input_creator != nullptr); UInt64 has_output_format(creators.output_creator != nullptr); UInt64 supports_parallel_parsing(creators.file_segmentation_engine_creator != nullptr || creators.random_access_input_creator != nullptr); diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index ac96364b5bd..8f558adb09b 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -32,7 +32,6 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int BAD_ARGUMENTS; - extern const int UNKNOWN_FORMAT; } namespace @@ -81,7 +80,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blob_path = checkAndGetLiteralArgument(engine_args[2], "blobpath"); auto is_format_arg - = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().checkFormatName(s); }; + = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().exists(s); }; if (engine_args.size() == 4) { @@ -208,7 +207,7 @@ void TableFunctionAzureBlobStorage::updateStructureAndFormatArgumentsIfNeeded(AS arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); auto is_format_arg - = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().checkFormatName(s); }; + = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().exists(s); }; /// (connection_string, container_name, blobpath) if (args.size() == 3) diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 04182fa4e68..c00b1e2e3e5 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -31,7 +31,6 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int LOGICAL_ERROR; - extern const int UNKNOWN_FORMAT; } @@ -101,7 +100,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context no_sign_request = true; args_to_idx = {{"format", 2}}; } - else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().exists(second_arg)) args_to_idx = {{"format", 1}, {"structure", 2}}; else args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; @@ -120,14 +119,14 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context no_sign_request = true; args_to_idx = {{"format", 2}, {"structure", 3}}; } - else if (second_arg == "auto" || 
FormatFactory::instance().checkFormatName(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().exists(second_arg)) { args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; } else { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; } @@ -154,7 +153,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context else { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}; } @@ -171,7 +170,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context else if (count == 6) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/session_token"); - if (fourth_arg == "auto" || FormatFactory::instance().checkFormatName(fourth_arg)) + if (fourth_arg == "auto" || FormatFactory::instance().exists(fourth_arg)) { args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}; } @@ -301,7 +300,7 @@ void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, con args.push_back(structure_literal); } /// s3(source, format, structure) - else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().exists(second_arg)) { if (second_arg == "auto") args[1] = format_literal; @@ -331,7 +330,7 @@ void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, con args[3] = structure_literal; } /// s3(source, format, structure, compression_method) - else if (second_arg == "auto" || FormatFactory::instance().checkFormatName(second_arg)) + else if (second_arg == "auto" || FormatFactory::instance().exists(second_arg)) { if (second_arg == "auto") args[1] = format_literal; From 24155c80c987356fb6f71060563932a9ede6a14c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B1=AA=E8=82=A5=E8=82=A5?= Date: Tue, 27 Feb 2024 07:50:04 +0800 Subject: [PATCH 06/66] Update src/Formats/FormatFactory.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Formats/FormatFactory.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 2bead318173..527e0a20753 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -49,14 +49,11 @@ FormatFactory::Creators & FormatFactory::getOrCreateCreators(const String & name String lower_case = boost::to_lower_copy(name); auto it = dict.find(lower_case); if (dict.end() != it) - { return it->second; - } - else - { - auto & creators = dict[lower_case]; - creators.name = name; - return creators; + + auto & creators = dict[lower_case]; + creators.name = name; + return creators; } } From 6f9cb058a6ff0306fd4b2bd27ec0057185697f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B1=AA=E8=82=A5=E8=82=A5?= Date: Tue, 27 Feb 2024 07:59:09 +0800 Subject: [PATCH 07/66] Update FormatFactory.cpp --- src/Formats/FormatFactory.cpp | 1 - 1 file changed, 1 deletion(-) diff 
--git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 527e0a20753..3303a0a4b66 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -54,7 +54,6 @@ FormatFactory::Creators & FormatFactory::getOrCreateCreators(const String & name auto & creators = dict[lower_case]; creators.name = name; return creators; - } } FormatSettings getFormatSettings(const ContextPtr & context) From c395e4f52f9ea4d68dfd08585053d63f8dbeae31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 27 Feb 2024 12:20:44 +0100 Subject: [PATCH 08/66] Add missing reference --- tests/queries/0_stateless/02972_parallel_replicas_cte.reference | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02972_parallel_replicas_cte.reference b/tests/queries/0_stateless/02972_parallel_replicas_cte.reference index 3321ade3a24..bbb5a960463 100644 --- a/tests/queries/0_stateless/02972_parallel_replicas_cte.reference +++ b/tests/queries/0_stateless/02972_parallel_replicas_cte.reference @@ -2,3 +2,5 @@ 990000 10 990000 +1 +1000000 From c9dd6fe8d57b20918b75e4c8ef06094af0bad229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 27 Feb 2024 12:35:14 +0100 Subject: [PATCH 09/66] Set max_parallel_replicas to 1 when disabling --- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 1a9827d30f8..1c9b8d911d9 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -885,7 +885,7 @@ bool InterpreterSelectQuery::adjustParallelReplicasAfterAnalysis() { /// The query could use trivial count if it didn't use parallel replicas, so let's disable it and reanalyze context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); - context->setSetting("max_parallel_replicas", UInt64{0}); + context->setSetting("max_parallel_replicas", UInt64{1}); LOG_DEBUG(log, "Disabling parallel replicas to be able to use a trivial count optimization"); return true; } From dcafa2a3b3733cc612d21baf58d7998fe31a1fb9 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 28 Feb 2024 10:56:28 +0000 Subject: [PATCH 10/66] Fix: test_parallel_replicas_custom_key_load_balancing --- .../test.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_parallel_replicas_custom_key_load_balancing/test.py b/tests/integration/test_parallel_replicas_custom_key_load_balancing/test.py index b9d4d029703..d5e17103296 100644 --- a/tests/integration/test_parallel_replicas_custom_key_load_balancing/test.py +++ b/tests/integration/test_parallel_replicas_custom_key_load_balancing/test.py @@ -109,10 +109,13 @@ def test_parallel_replicas_custom_key_load_balancing( == "subqueries\t4\n" ) - # check queries per node - assert ( - node1.query( - f"SELECT h, count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' GROUP BY hostname() as h ORDER BY h SETTINGS skip_unavailable_shards=1" + # With enabled hedged requests, we can't guarantee exact query distribution among nodes + # In case of a replica being slow in terms of responsiveness, hedged connection can change initial replicas choice + if use_hedged_requests == 0: + # check queries per node + assert ( + node1.query( + f"SELECT h, count() FROM clusterAllReplicas({cluster_name}, system.query_log) 
WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' GROUP BY hostname() as h ORDER BY h SETTINGS skip_unavailable_shards=1" + ) + == "n1\t2\nn2\t1\nn3\t1\nn4\t1\n" ) - == "n1\t2\nn2\t1\nn3\t1\nn4\t1\n" - ) From f86213ecb615aa0949b878c3ac58ec63560dd886 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 28 Feb 2024 16:29:24 +0100 Subject: [PATCH 11/66] Cancel PipelineExecutor properly in case of exception in spawnThreads --- src/Processors/Executors/PipelineExecutor.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index a06bacd7d3b..c3fbe6788c6 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -399,7 +399,18 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control) if (num_threads > 1) { - spawnThreads(); // start at least one thread + try + { + spawnThreads(); // start at least one thread + } + catch (...) + { + /// spawnThreads can throw an exception, for example CANNOT_SCHEDULE_TASK. + /// We should cancel execution properly before rethrowing. + cancel(); + throw; + } + tasks.processAsyncTasks(); pool->wait(); } From 0c902f8d648789f8cf1117c975f5db2bba1898b4 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 28 Feb 2024 18:28:54 +0100 Subject: [PATCH 12/66] Fix style --- src/Processors/Executors/PipelineExecutor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index c3fbe6788c6..8477e011763 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -410,7 +410,7 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control) cancel(); throw; } - + tasks.processAsyncTasks(); pool->wait(); } From 7077499064538a43617e56b28b21411b9ee11828 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 28 Feb 2024 20:50:56 +0000 Subject: [PATCH 13/66] PullingAsyncPipelineExecutor cleanup lazy_format is always used --- .../PullingAsyncPipelineExecutor.cpp | 37 +++---------------- .../Executors/PushingAsyncPipelineExecutor.h | 1 - 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp index 345bec395b2..d27002197d2 100644 --- a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp +++ b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp @@ -23,7 +23,6 @@ struct PullingAsyncPipelineExecutor::Data std::atomic_bool is_finished = false; std::atomic_bool has_exception = false; ThreadFromGlobalPool thread; - Poco::Event finish_event; ~Data() { @@ -89,12 +88,10 @@ static void threadFunction( data.has_exception = true; /// Finish lazy format in case of exception. Otherwise thread.join() may hang.
- if (data.lazy_format) - data.lazy_format->finalize(); + data.lazy_format->finalize(); } data.is_finished = true; - data.finish_event.set(); } @@ -129,20 +126,8 @@ bool PullingAsyncPipelineExecutor::pull(Chunk & chunk, uint64_t milliseconds) return false; } - if (lazy_format) - { - chunk = lazy_format->getChunk(milliseconds); - data->rethrowExceptionIfHas(); - return true; - } - - chunk.clear(); - - if (milliseconds) - data->finish_event.tryWait(milliseconds); - else - data->finish_event.wait(); - + chunk = lazy_format->getChunk(milliseconds); + data->rethrowExceptionIfHas(); return true; } @@ -230,14 +215,12 @@ void PullingAsyncPipelineExecutor::cancelWithExceptionHandling(CancelFunc && can Chunk PullingAsyncPipelineExecutor::getTotals() { - return lazy_format ? lazy_format->getTotals() - : Chunk(); + return lazy_format->getTotals(); } Chunk PullingAsyncPipelineExecutor::getExtremes() { - return lazy_format ? lazy_format->getExtremes() - : Chunk(); + return lazy_format->getExtremes(); } Block PullingAsyncPipelineExecutor::getTotalsBlock() @@ -264,15 +247,7 @@ Block PullingAsyncPipelineExecutor::getExtremesBlock() ProfileInfo & PullingAsyncPipelineExecutor::getProfileInfo() { - if (lazy_format) - return lazy_format->getProfileInfo(); - - static ProfileInfo profile_info; - static std::once_flag flag; - /// Calculate rows before limit here to avoid race. - std::call_once(flag, []() { profile_info.getRowsBeforeLimit(); }); - - return profile_info; + return lazy_format->getProfileInfo(); } } diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.h b/src/Processors/Executors/PushingAsyncPipelineExecutor.h index 4b4b83a90b5..f976cd4c339 100644 --- a/src/Processors/Executors/PushingAsyncPipelineExecutor.h +++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.h @@ -1,6 +1,5 @@ #pragma once #include -#include #include namespace DB From 763bd227259c8f54c0babcb13916ac7dc6c8205a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 28 Feb 2024 23:43:03 +0100 Subject: [PATCH 14/66] Synchronize metrics and Keeper --- src/Common/CurrentMetrics.cpp | 12 +++++- src/Common/ErrorCodes.cpp | 4 ++ src/Common/FailPoint.cpp | 8 ++++ src/Common/ProfileEvents.cpp | 48 +++++++++++++++++++++++- src/Common/SystemLogBase.cpp | 1 + src/Common/SystemLogBase.h | 1 + src/Common/ThreadStatus.cpp | 5 ++- src/Common/ZooKeeper/IKeeper.h | 5 +++ src/Common/ZooKeeper/TestKeeper.cpp | 11 ++++++ src/Common/ZooKeeper/TestKeeper.h | 4 ++ src/Common/ZooKeeper/ZooKeeper.cpp | 5 +++ src/Common/ZooKeeper/ZooKeeper.h | 1 + src/Common/ZooKeeper/ZooKeeperCommon.cpp | 10 +++++ src/Common/ZooKeeper/ZooKeeperCommon.h | 7 +--- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 7 ++++ src/Common/ZooKeeper/ZooKeeperImpl.h | 4 ++ src/Common/ZooKeeper/ZooKeeperRetries.h | 15 ++++++++ 17 files changed, 139 insertions(+), 9 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 6931001202d..82da4c4bbad 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -264,7 +264,17 @@ M(RefreshingViews, "Number of materialized views currently executing a refresh") \ M(StorageBufferFlushThreads, "Number of threads for background flushes in StorageBuffer") \ M(StorageBufferFlushThreadsActive, "Number of threads for background flushes in StorageBuffer running a task") \ - M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer") + M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes 
in StorageBuffer") \ + M(SharedMergeTreeThreads, "Number of threads in the thread pools in internals of SharedMergeTree") \ + M(SharedMergeTreeThreadsActive, "Number of threads in the thread pools in internals of SharedMergeTree running a task") \ + M(SharedMergeTreeThreadsScheduled, "Number of queued or active threads in the thread pools in internals of SharedMergeTree") \ + M(SharedMergeTreeFetch, "Number of fetches in progress") \ + M(CacheWarmerBytesInProgress, "Total size of remote file segments waiting to be asynchronously loaded into filesystem cache.") \ + M(DistrCacheOpenedConnections, "Number of open connections to Distributed Cache") \ + M(DistrCacheUsedConnections, "Number of currently used connections to Distributed Cache") \ + M(DistrCacheReadRequests, "Number of executed Read requests to Distributed Cache") \ + M(DistrCacheWriteRequests, "Number of executed Write requests to Distributed Cache") \ + M(DistrCacheServerConnections, "Number of open connections to ClickHouse server from Distributed Cache") #ifdef APPLY_FOR_EXTERNAL_METRICS #define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index ca00f2fd513..eca4db2307c 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -585,6 +585,10 @@ M(703, INVALID_IDENTIFIER) \ M(704, QUERY_CACHE_USED_WITH_NONDETERMINISTIC_FUNCTIONS) \ M(705, TABLE_NOT_EMPTY) \ + \ + M(900, DISTRIBUTED_CACHE_ERROR) \ + M(901, CANNOT_USE_DISTRIBUTED_CACHE) \ + \ M(706, LIBSSH_ERROR) \ M(707, GCP_ERROR) \ M(708, ILLEGAL_STATISTIC) \ diff --git a/src/Common/FailPoint.cpp b/src/Common/FailPoint.cpp index a23133b7522..9e551c8f2cd 100644 --- a/src/Common/FailPoint.cpp +++ b/src/Common/FailPoint.cpp @@ -39,6 +39,14 @@ static struct InitFiu REGULAR(replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault) \ REGULAR(use_delayed_remote_source) \ REGULAR(cluster_discovery_faults) \ + ONCE(smt_commit_merge_mutate_zk_fail_after_op) \ + ONCE(smt_commit_merge_mutate_zk_fail_before_op) \ + ONCE(smt_commit_write_zk_fail_after_op) \ + ONCE(smt_commit_write_zk_fail_before_op) \ + ONCE(smt_commit_merge_change_version_before_op) \ + ONCE(smt_merge_mutate_intention_freeze_in_destructor) \ + ONCE(meta_in_keeper_create_metadata_failure) \ + REGULAR(cache_warmer_stall) \ REGULAR(check_table_query_delay_for_part) \ REGULAR(dummy_failpoint) \ REGULAR(prefetched_reader_pool_failpoint) \ diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index d8ca1ab9e93..53da7901577 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -92,6 +92,8 @@ M(LocalWriteThrottlerBytes, "Bytes passed through 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttler.") \ M(LocalWriteThrottlerSleepMicroseconds, "Total time a query was sleeping to conform 'max_local_write_bandwidth_for_server'/'max_local_write_bandwidth' throttling.") \ M(ThrottlerSleepMicroseconds, "Total time a query was sleeping to conform all throttling settings.") \ + M(PartsWithAppliedMutationsOnFly, "Total number of parts for which there was any mutation applied on fly") \ + M(MutationsAppliedOnFlyInAllParts, "The sum of number of applied mutations on-fly for part among all read parts") \ \ M(QueryMaskingRulesMatch, "Number of times query masking rules was successfully matched.") \ \ @@ -311,6 +313,12 @@ The server successfully detected this situation and will download merged part fr M(ParallelReplicasProcessingPartsMicroseconds, "Time 
spent processing data parts") \
    M(ParallelReplicasStealingLeftoversMicroseconds, "Time spent collecting orphaned segments") \
    M(ParallelReplicasCollectingOwnedSegmentsMicroseconds, "Time spent collecting segments meant by hash") \
+    M(ParallelReplicasNumRequests, "Number of requests to the initiator.") \
+    M(ParallelReplicasDeniedRequests, "Number of completely denied requests to the initiator") \
+    M(CacheWarmerBytesDownloaded, "Amount of data fetched into filesystem cache by dedicated background threads.") \
+    M(CacheWarmerDataPartsDownloaded, "Number of data parts that were fully fetched by CacheWarmer.") \
+    M(IgnoredColdParts, "See setting ignore_cold_parts_seconds. Number of times read queries ignored very new parts that weren't pulled into cache by CacheWarmer yet.") \
+    M(PreferredWarmedUnmergedParts, "See setting prefer_warmed_unmerged_parts_seconds. Number of times read queries used outdated pre-merge parts that are in cache instead of merged part that wasn't pulled into cache by CacheWarmer yet.") \
    \
    M(PerfCPUCycles, "Total cycles. Be wary of what happens during CPU frequency scaling.") \
    M(PerfInstructions, "Retired instructions. Be careful, these can be affected by various issues, most notably hardware interrupt counts.") \
@@ -516,6 +524,21 @@ The server successfully detected this situation and will download merged part fr
    M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.") \
    M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.") \
    \
+    M(MetadataFromKeeperCacheHit, "Number of times an object storage metadata request was answered from cache without making a request to Keeper") \
+    M(MetadataFromKeeperCacheMiss, "Number of times an object storage metadata request had to be answered from Keeper") \
+    M(MetadataFromKeeperCacheUpdateMicroseconds, "Total time spent in updating the cache including waiting for responses from Keeper") \
+    M(MetadataFromKeeperUpdateCacheOneLevel, "Number of times a cache update for one level of directory tree was done") \
+    M(MetadataFromKeeperTransactionCommit, "Number of times metadata transaction commit was attempted") \
+    M(MetadataFromKeeperTransactionCommitRetry, "Number of times metadata transaction commit was retried") \
+    M(MetadataFromKeeperCleanupTransactionCommit, "Number of times metadata transaction commit for deleted objects cleanup was attempted") \
+    M(MetadataFromKeeperCleanupTransactionCommitRetry, "Number of times metadata transaction commit for deleted objects cleanup was retried") \
+    M(MetadataFromKeeperOperations, "Number of times a request was made to Keeper") \
+    M(MetadataFromKeeperIndividualOperations, "Number of paths read or written by single or multi requests to Keeper") \
+    M(MetadataFromKeeperReconnects, "Number of times a reconnect to Keeper was done") \
+    M(MetadataFromKeeperBackgroundCleanupObjects, "Number of times an old deleted object cleanup was performed by a background task") \
+    M(MetadataFromKeeperBackgroundCleanupTransactions, "Number of times an old transaction idempotency token was cleaned up by a background task") \
+    M(MetadataFromKeeperBackgroundCleanupErrors, "Number of times an error was encountered in the background cleanup task") \
+    \
    M(KafkaRebalanceRevocations, "Number of partition revocations (the first stage of consumer group rebalance)") \
    M(KafkaRebalanceAssignments, "Number of partition assignments (the final stage of consumer group rebalance)") \
    M(KafkaRebalanceErrors, "Number of failed consumer group rebalances") \
@@ -607,9 +630,32 @@ The server successfully detected this situation and will download merged part fr
    M(MergeTreeAllRangesAnnouncementsSentElapsedMicroseconds, "Time spent in sending the announcement from the remote server to the initiator server about the set of data parts (for MergeTree tables). Measured on the remote server side.") \
    \
    M(ConnectionPoolIsFullMicroseconds, "Total time spent waiting for a slot in connection pool.") \
-    \
    M(AsyncLoaderWaitMicroseconds, "Total time a query was waiting for async loader jobs.") \
    \
+    M(DistrCacheServerSwitches, "Number of server switches between distributed cache servers in read/write-through cache") \
+    M(DistrCacheReadMicroseconds, "Time spent reading from distributed cache") \
+    M(DistrCacheFallbackReadMicroseconds, "Time spent reading from fallback buffer instead of distributed cache") \
+    M(DistrCachePrecomputeRangesMicroseconds, "Time spent to precompute read ranges") \
+    M(DistrCacheNextImplMicroseconds, "Time spent in ReadBufferFromDistributedCache::nextImpl") \
+    M(DistrCacheOpenedConnections, "The number of open connections to distributed cache") \
+    M(DistrCacheReusedConnections, "The number of reused connections to distributed cache") \
+    M(DistrCacheHoldConnections, "The number of used connections to distributed cache") \
+    \
+    M(DistrCacheGetResponseMicroseconds, "Time spent waiting for a response from distributed cache") \
+    M(DistrCacheStartRangeMicroseconds, "Time spent to start a new read range with distributed cache") \
+    M(DistrCacheLockRegistryMicroseconds, "Time spent to take DistributedCacheRegistry lock") \
+    M(DistrCacheUnusedPackets, "Number of skipped unused packets from distributed cache") \
+    M(DistrCachePackets, "Total number of packets received from distributed cache") \
+    M(DistrCacheUnusedPacketsBytes, "The number of bytes in Data packets which were ignored") \
+    M(DistrCacheRegistryUpdateMicroseconds, "Time spent updating distributed cache registry") \
+    M(DistrCacheRegistryUpdates, "Number of distributed cache registry updates") \
+    \
+    M(DistrCacheConnectMicroseconds, "The time spent to connect to distributed cache") \
+    M(DistrCacheConnectAttempts, "The number of connection attempts to distributed cache") \
+    M(DistrCacheGetClient, "Number of client access times") \
+    \
+    M(DistrCacheServerProcessRequestMicroseconds, "Time spent processing a request on the DistributedCache server side") \
+    \
    M(LogTest, "Number of log messages with level Test") \
    M(LogTrace, "Number of log messages with level Trace") \
    M(LogDebug, "Number of log messages with level Debug") \
diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp
index 4dee6d905d9..0e7287c59ac 100644
--- a/src/Common/SystemLogBase.cpp
+++ b/src/Common/SystemLogBase.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/src/Common/SystemLogBase.h b/src/Common/SystemLogBase.h
index a734c70f285..c509887cd28 100644
--- a/src/Common/SystemLogBase.h
+++ b/src/Common/SystemLogBase.h
@@ -29,6 +29,7 @@
    M(TextLogElement) \
    M(S3QueueLogElement) \
    M(FilesystemCacheLogElement) \
+    M(DistributedCacheLogElement) \
    M(FilesystemReadPrefetchesLogElement) \
    M(AsynchronousInsertLogElement) \
    M(BackupLogElement) \
diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp
index 05524a5d6b9..cf50d305e95 100644
--- a/src/Common/ThreadStatus.cpp
+++ b/src/Common/ThreadStatus.cpp
@@ -196,8 +196,9 @@ bool ThreadStatus::isQueryCanceled() const
    if (!thread_group)
        return false;

-
chassert(local_data.query_is_canceled_predicate); - return local_data.query_is_canceled_predicate(); + if (local_data.query_is_canceled_predicate) + return local_data.query_is_canceled_predicate(); + return false; } ThreadStatus::~ThreadStatus() diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 76cdfe9f230..04f53ead066 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -8,6 +8,7 @@ #include #include #include +#include #include /** Generic interface for ZooKeeper-like services. @@ -622,6 +623,10 @@ public: int32_t version, ReconfigCallback callback) = 0; + virtual void multi( + std::span requests, + MultiCallback callback) = 0; + virtual void multi( const Requests & requests, MultiCallback callback) = 0; diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index a25329ad7c0..fce29a21e15 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -157,6 +157,10 @@ struct TestKeeperReconfigRequest final : ReconfigRequest, TestKeeperRequest struct TestKeeperMultiRequest final : MultiRequest, TestKeeperRequest { explicit TestKeeperMultiRequest(const Requests & generic_requests) + : TestKeeperMultiRequest(std::span(generic_requests)) + {} + + explicit TestKeeperMultiRequest(std::span generic_requests) { requests.reserve(generic_requests.size()); @@ -883,6 +887,13 @@ void TestKeeper::reconfig( void TestKeeper::multi( const Requests & requests, MultiCallback callback) +{ + return multi(std::span(requests), std::move(callback)); +} + +void TestKeeper::multi( + std::span requests, + MultiCallback callback) { TestKeeperMultiRequest request(requests); diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index 36db5accff1..2774055652c 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -101,6 +101,10 @@ public: const Requests & requests, MultiCallback callback) override; + void multi( + std::span requests, + MultiCallback callback) override; + void finalize(const String & reason) override; bool isFeatureEnabled(DB::KeeperFeatureFlag) const override diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 93568909041..ca0a211c716 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -1266,6 +1266,11 @@ std::future ZooKeeper::asyncTryRemoveNoThrow(const } std::future ZooKeeper::asyncTryMultiNoThrow(const Coordination::Requests & ops) +{ + return asyncTryMultiNoThrow(std::span(ops)); +} + +std::future ZooKeeper::asyncTryMultiNoThrow(std::span ops) { auto promise = std::make_shared>(); auto future = promise->get_future(); diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index f1c333bb378..b2e159b0450 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -550,6 +550,7 @@ public: FutureMulti asyncMulti(const Coordination::Requests & ops); /// Like the previous one but don't throw any exceptions on future.get() FutureMulti asyncTryMultiNoThrow(const Coordination::Requests & ops); + FutureMulti asyncTryMultiNoThrow(std::span ops); using FutureSync = std::future; FutureSync asyncSync(const std::string & path); diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 660ae59e81e..4634eae7759 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -156,6 +156,12 @@ std::string 
ZooKeeperAuthRequest::toStringImpl() const void ZooKeeperCreateRequest::writeImpl(WriteBuffer & out) const { + /// See https://github.com/ClickHouse/clickhouse-private/issues/3029 + if (path.starts_with("/clickhouse/tables/") && path.find("/parts/") != std::string::npos) + { + LOG_TRACE(getLogger(__PRETTY_FUNCTION__), "Creating part at path {}", path); + } + Coordination::write(path, out); Coordination::write(data, out); Coordination::write(acls, out); @@ -480,6 +486,10 @@ OpNum ZooKeeperMultiRequest::getOpNum() const } ZooKeeperMultiRequest::ZooKeeperMultiRequest(const Requests & generic_requests, const ACLs & default_acls) + : ZooKeeperMultiRequest(std::span{generic_requests}, default_acls) +{} + +ZooKeeperMultiRequest::ZooKeeperMultiRequest(std::span generic_requests, const ACLs & default_acls) { /// Convert nested Requests to ZooKeeperRequests. /// Note that deep copy is required to avoid modifying path in presence of chroot prefix. diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 5289be7a816..a1bd9b582e9 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -7,17 +7,13 @@ #include #include #include -#include #include -#include -#include #include #include -#include -#include #include #include #include +#include namespace Coordination @@ -516,6 +512,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest ZooKeeperMultiRequest() = default; ZooKeeperMultiRequest(const Requests & generic_requests, const ACLs & default_acls); + ZooKeeperMultiRequest(std::span generic_requests, const ACLs & default_acls); void writeImpl(WriteBuffer & out) const override; void readImpl(ReadBuffer & in) override; diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 1fbadbd7616..8fd6e89dfd9 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1454,6 +1454,13 @@ void ZooKeeper::reconfig( void ZooKeeper::multi( const Requests & requests, MultiCallback callback) +{ + multi(std::span(requests), std::move(callback)); +} + +void ZooKeeper::multi( + std::span requests, + MultiCallback callback) { ZooKeeperMultiRequest request(requests, default_acls); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index b63f67bf7a6..d089ab7cc04 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -194,6 +194,10 @@ public: int32_t version, ReconfigCallback callback) final; + void multi( + std::span requests, + MultiCallback callback) override; + void multi( const Requests & requests, MultiCallback callback) override; diff --git a/src/Common/ZooKeeper/ZooKeeperRetries.h b/src/Common/ZooKeeper/ZooKeeperRetries.h index ecef174c6c7..d411549346a 100644 --- a/src/Common/ZooKeeper/ZooKeeperRetries.h +++ b/src/Common/ZooKeeper/ZooKeeperRetries.h @@ -147,6 +147,11 @@ public: user_error = UserError{}; } + void setKeeperError(const zkutil::KeeperException & exception) + { + setKeeperError(std::make_exception_ptr(exception), exception.code, exception.message()); + } + void stopRetries() { stop_retries = true; } bool isLastRetry() const { return total_failures >= retries_info.max_retries; } @@ -180,6 +185,12 @@ private: bool canTry() { + if (unconditional_retry) + { + unconditional_retry = false; + return true; + } + if (iteration_succeeded) { if (logger && total_failures > 0) @@ -275,6 +286,10 @@ private: UInt64 current_iteration = 0; UInt64 current_backoff_ms = 
0; + +public: + /// This is used in SharedMergeTree + bool unconditional_retry = false; }; } From 282c3b55f21e3b3da1cccf0570fd732097d98305 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 28 Feb 2024 23:47:33 +0100 Subject: [PATCH 15/66] Synchronize small pieces --- src/Coordination/KeeperSnapshotManagerS3.cpp | 1 + src/Coordination/Standalone/Context.cpp | 5 +++++ src/Coordination/Standalone/Context.h | 3 +++ src/Core/MySQL/Authentication.cpp | 5 +++++ src/Core/SettingsEnums.cpp | 8 ++++++++ src/Core/SettingsEnums.h | 4 ++++ src/DataTypes/Serializations/ISerialization.cpp | 1 - 7 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 0337a564660..f2d861e8fd4 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -215,6 +215,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh } /// To avoid reference to binding + const auto & snapshot_path_ref = snapshot_path; SCOPE_EXIT( diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp index 374610769c4..264cf118501 100644 --- a/src/Coordination/Standalone/Context.cpp +++ b/src/Coordination/Standalone/Context.cpp @@ -382,4 +382,9 @@ std::shared_ptr Context::getZooKeeper() const throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot connect to ZooKeeper from Keeper"); } +const ServerSettings & Context::getServerSettings() const +{ + return shared->server_settings; +} + } diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h index 49ad2b568fe..adb9111185f 100644 --- a/src/Coordination/Standalone/Context.h +++ b/src/Coordination/Standalone/Context.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -160,6 +161,8 @@ public: void updateKeeperConfiguration(const Poco::Util::AbstractConfiguration & config); zkutil::ZooKeeperPtr getZooKeeper() const; + + const ServerSettings & getServerSettings() const; }; } diff --git a/src/Core/MySQL/Authentication.cpp b/src/Core/MySQL/Authentication.cpp index ac6ed70dbb5..ac625e216cd 100644 --- a/src/Core/MySQL/Authentication.cpp +++ b/src/Core/MySQL/Authentication.cpp @@ -9,6 +9,11 @@ #include #include +#include +#include + + +using namespace std::literals; namespace DB { diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 04e1d0a18c8..ba41a4ed7e7 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -56,6 +56,14 @@ IMPLEMENT_SETTING_ENUM(OverflowMode, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, {"break", OverflowMode::BREAK}}) +IMPLEMENT_SETTING_ENUM(DistributedCacheLogMode, ErrorCodes::BAD_ARGUMENTS, + {{"nothing", DistributedCacheLogMode::LOG_NOTHING}, + {"on_error", DistributedCacheLogMode::LOG_ON_ERROR}, + {"all", DistributedCacheLogMode::LOG_ALL}}) + +IMPLEMENT_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit, ErrorCodes::BAD_ARGUMENTS, + {{"wait", DistributedCachePoolBehaviourOnLimit::WAIT}, + {"allocate_bypassing_pool", DistributedCachePoolBehaviourOnLimit::ALLOCATE_NEW_BYPASSING_POOL}}); IMPLEMENT_SETTING_ENUM(OverflowModeGroupBy, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 691eefbd4e6..db9842aaf86 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -141,6 +141,10 @@ enum class DefaultTableEngine DECLARE_SETTING_ENUM(DefaultTableEngine) 
+DECLARE_SETTING_ENUM(DistributedCacheLogMode) + +DECLARE_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit) + enum class CleanDeletedRows { Never = 0, /// Disable. diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 7d57d72090b..a3a28f8091c 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -417,4 +417,3 @@ void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadB } } - From 704b32fdcd7d0cb6171d8b6fdcb6d440e4f5f4ac Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 28 Feb 2024 23:49:35 +0100 Subject: [PATCH 16/66] Fix build --- src/Core/SettingsEnums.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index db9842aaf86..0d0138e6246 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -141,9 +141,6 @@ enum class DefaultTableEngine DECLARE_SETTING_ENUM(DefaultTableEngine) -DECLARE_SETTING_ENUM(DistributedCacheLogMode) - -DECLARE_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit) enum class CleanDeletedRows { From 9e826bb11ce9482dfa1cc1984618553b0a682c72 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 28 Feb 2024 23:50:15 +0100 Subject: [PATCH 17/66] Fix build --- src/Common/SystemLogBase.cpp | 1 - src/Core/SettingsEnums.cpp | 5 ----- 2 files changed, 6 deletions(-) diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index 0e7287c59ac..4dee6d905d9 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index ba41a4ed7e7..64b10e52a85 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -56,11 +56,6 @@ IMPLEMENT_SETTING_ENUM(OverflowMode, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, {"break", OverflowMode::BREAK}}) -IMPLEMENT_SETTING_ENUM(DistributedCacheLogMode, ErrorCodes::BAD_ARGUMENTS, - {{"nothing", DistributedCacheLogMode::LOG_NOTHING}, - {"on_error", DistributedCacheLogMode::LOG_ON_ERROR}, - {"all", DistributedCacheLogMode::LOG_ALL}}) - IMPLEMENT_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit, ErrorCodes::BAD_ARGUMENTS, {{"wait", DistributedCachePoolBehaviourOnLimit::WAIT}, {"allocate_bypassing_pool", DistributedCachePoolBehaviourOnLimit::ALLOCATE_NEW_BYPASSING_POOL}}); From c6d8cf2afabd99c26881816dd283fddd42b4171c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 28 Feb 2024 23:51:31 +0100 Subject: [PATCH 18/66] Fix build --- src/Common/SystemLogBase.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index 4dee6d905d9..aef4e19a70c 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -260,10 +260,4 @@ void SystemLogBase::add(LogElement element) template void SystemLogBase::notifyFlush(bool force) { queue->notifyFlush(force); } -#define INSTANTIATE_SYSTEM_LOG_BASE(ELEMENT) template class SystemLogBase; -SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_BASE) - -#define INSTANTIATE_SYSTEM_LOG_QUEUE(ELEMENT) template class SystemLogQueue; -SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_QUEUE) - } From a46cb36368a5595e84937e24c427afc93756a54b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 28 Feb 2024 23:51:48 +0100 Subject: [PATCH 19/66] Fix build --- src/Core/SettingsEnums.cpp | 3 --- 1 file changed, 3 
deletions(-) diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 64b10e52a85..04e1d0a18c8 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -56,9 +56,6 @@ IMPLEMENT_SETTING_ENUM(OverflowMode, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, {"break", OverflowMode::BREAK}}) -IMPLEMENT_SETTING_ENUM(DistributedCachePoolBehaviourOnLimit, ErrorCodes::BAD_ARGUMENTS, - {{"wait", DistributedCachePoolBehaviourOnLimit::WAIT}, - {"allocate_bypassing_pool", DistributedCachePoolBehaviourOnLimit::ALLOCATE_NEW_BYPASSING_POOL}}); IMPLEMENT_SETTING_ENUM(OverflowModeGroupBy, ErrorCodes::UNKNOWN_OVERFLOW_MODE, {{"throw", OverflowMode::THROW}, From 42437a2ae14c142ee629021c71fac6ea6107defc Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 29 Feb 2024 03:27:32 -0800 Subject: [PATCH 20/66] Userspace page cache (#53770) * Userspace page cache * Maybe it'll build this time, who knows. * 'auto' went out of fashion, I guess * Documentation, tsan workaround, metric 'UnreclaimableRss', disable page cache in the test that uses DatabaseOrdinary * Moved CachedInMemoryReadBufferFromFile to object store level, changed settings, addressed other comments. * Fix * Another fix * Fix restricted seek, fix ppc64le build * Don't allow page cache with file cache * Adjust tests a little * Fix clang-tidy * Conflicts * Comments * Maybe unbroke AsynchronousBoundedReadBuffer * SettingsChangesHistory.h * Fix warning in test --- docs/en/operations/storing-data.md | 10 + .../example-datasets/opensky.mdx | 12 +- programs/server/Server.cpp | 7 + src/Access/Common/AccessType.h | 1 + src/Common/PageCache.cpp | 688 ++++++++++++++++++ src/Common/PageCache.h | 299 ++++++++ src/Common/ProfileEvents.cpp | 9 + src/Core/Defines.h | 9 + src/Core/ServerSettings.h | 7 +- src/Core/Settings.h | 4 + src/Core/SettingsChangesHistory.h | 3 + .../IO/AsynchronousBoundedReadBuffer.cpp | 12 +- .../IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 67 +- src/Disks/IO/ReadBufferFromRemoteFSGather.h | 9 +- src/Disks/IO/ThreadPoolRemoteFSReader.cpp | 2 + src/Disks/IO/ThreadPoolRemoteFSReader.h | 3 + .../AzureBlobStorage/AzureObjectStorage.cpp | 8 +- .../ObjectStorages/DiskObjectStorage.cpp | 3 +- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 4 +- .../Local/LocalObjectStorage.cpp | 6 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 9 +- .../ObjectStorages/Web/WebObjectStorage.cpp | 7 +- src/IO/AsynchronousReader.h | 3 + src/IO/BufferBase.h | 3 + src/IO/CachedInMemoryReadBufferFromFile.cpp | 188 +++++ src/IO/CachedInMemoryReadBufferFromFile.h | 41 ++ src/IO/ReadBuffer.h | 19 +- src/IO/ReadSettings.h | 7 + src/Interpreters/Context.cpp | 41 +- src/Interpreters/Context.h | 5 + src/Interpreters/InterpreterSystemQuery.cpp | 9 + .../ServerAsynchronousMetrics.cpp | 12 + src/Interpreters/tests/gtest_page_cache.cpp | 267 +++++++ src/Parsers/ASTSystemQuery.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 - src/Storages/StorageS3.cpp | 18 +- tests/clickhouse-test | 2 + .../01271_show_privileges.reference | 1 + .../0_stateless/02867_page_cache.reference | 23 + .../queries/0_stateless/02867_page_cache.sql | 105 +++ 41 files changed, 1854 insertions(+), 76 deletions(-) create mode 100644 src/Common/PageCache.cpp create mode 100644 src/Common/PageCache.h create mode 100644 src/IO/CachedInMemoryReadBufferFromFile.cpp create mode 100644 src/IO/CachedInMemoryReadBufferFromFile.h create mode 100644 src/Interpreters/tests/gtest_page_cache.cpp create mode 100644 
tests/queries/0_stateless/02867_page_cache.reference create mode 100644 tests/queries/0_stateless/02867_page_cache.sql diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 003277c8d4f..84251812c01 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -275,6 +275,16 @@ Cache profile events: - `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds` +## Using in-memory cache (userspace page cache) {#userspace-page-cache} + +The File Cache described above stores cached data in local files. Alternatively, object-store-based disks can be configured to use "Userspace Page Cache", which is RAM-only. Userspace page cache is recommended only if file cache can't be used for some reason, e.g. if the machine doesn't have a local disk at all. Note that file cache effectively uses RAM for caching too, since the OS caches contents of local files. + +To enable userspace page cache for disks that don't use file cache, use setting `use_page_cache_for_disks_without_file_cache`. + +By default, on Linux, the userspace page cache will use all available memory, similar to the OS page cache. In tools like `top` and `ps`, the clickhouse server process will typically show resident set size near 100% of the machine's RAM - this is normal, and most of this memory is actually reclaimable by the OS on memory pressure (`MADV_FREE`). This behavior can be disabled with server setting `page_cache_use_madv_free = 0`, making the userspace page cache just use a fixed amount of memory `page_cache_size` with no special interaction with the OS. On Mac OS, `page_cache_use_madv_free` is always disabled as it doesn't have lazy `MADV_FREE`. + +Unfortunately, `page_cache_use_madv_free` makes it difficult to tell if the server is close to running out of memory, since the RSS metric becomes useless. Async metric `UnreclaimableRSS` shows the amount of physical memory used by the server, excluding the memory reclaimable by the OS: `select value from system.asynchronous_metrics where metric = 'UnreclaimableRSS'`. Use it for monitoring instead of RSS. This metric is only available if `page_cache_use_madv_free` is enabled. + ## Storing Data on Web Server {#storing-data-on-webserver} There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. 
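To make the new cache concrete before the next file: below is a minimal sketch, not part of the patch, of how a read path can consult the userspace page cache that this commit introduces. It uses the PageCache API as declared in src/Common/PageCache.h in this diff; the downloadChunk() helper and the wrapper function are hypothetical, and the sketch omits the per-chunk download coordination (see FileChunkState::download_mutex) that the real CachedInMemoryReadBufferFromFile has to perform.

    #include <Common/PageCache.h>

    using namespace DB;

    /// Hypothetical helper: fetch one chunk-aligned range from remote storage into `dest`.
    void downloadChunk(const FileChunkAddress & addr, char * dest, size_t bytes);

    /// Read-through pattern, assuming bytes <= cache.chunkSize().
    void readChunkThroughCache(PageCache & cache, const std::string & path, size_t file_offset, size_t bytes)
    {
        FileChunkAddress addr;
        addr.path = "s3:" + path; /// prefix with the storage system name, per the FileChunkAddress comment
        addr.offset = file_offset / cache.chunkSize() * cache.chunkSize(); /// align down to a chunk boundary

        /// Pin the chunk; its pages can't be reclaimed by the OS while `pinned` is alive.
        PinnedPageChunk pinned = cache.getOrSet(addr.hash(), /*detached_if_missing=*/ false, /*inject_eviction=*/ false);

        if (!pinned.isPrefixPopulated(bytes))
        {
            /// Miss, or some pages were reclaimed while the chunk was unpinned: repopulate the buffer.
            downloadChunk(addr, pinned.getChunk()->data, bytes);
            pinned.markPrefixPopulated(bytes);
        }

        /// pinned.getChunk()->data now holds `bytes` valid bytes until `pinned` is destroyed.
    }
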
diff --git a/docs/zh/getting-started/example-datasets/opensky.mdx b/docs/zh/getting-started/example-datasets/opensky.mdx index 92cd104e06e..b79c02ab780 100644 --- a/docs/zh/getting-started/example-datasets/opensky.mdx +++ b/docs/zh/getting-started/example-datasets/opensky.mdx @@ -1,4 +1,4 @@ ---- +--- slug: /zh/getting-started/example-datasets/opensky sidebar_label: 空中交通数据 description: 该数据集中的数据是从完整的 OpenSky 数据集中衍生而来的,对其中的数据进行了必要的清理,用以展示在 COVID-19 期间空中交通的发展。 @@ -53,12 +53,12 @@ CREATE TABLE opensky ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"' ``` -- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。 -- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。 -- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。 -- 我们还要求使用扩展解析器解析 [DateTime](../../sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](../../operations/settings/ settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。 +- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。 +- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。 +- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。 +- 我们还要求使用扩展解析器解析 [DateTime](/docs/zh/sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](/docs/zh/operations/settings/settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。 -最后,`clickhouse-client` 会以 [CSVWithNames](../../interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。 +最后,`clickhouse-client` 会以 [CSVWithNames](/docs/zh/interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。 并行导入需要 24 秒。 diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 6dc33042a05..786cb27d8c4 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1228,6 +1228,13 @@ try } global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio); + size_t page_cache_size = server_settings.page_cache_size; + if (page_cache_size != 0) + global_context->setPageCache( + server_settings.page_cache_chunk_size, server_settings.page_cache_mmap_size, + page_cache_size, server_settings.page_cache_use_madv_free, + server_settings.page_cache_use_transparent_huge_pages); + String index_uncompressed_cache_policy = server_settings.index_uncompressed_cache_policy; size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size; double index_uncompressed_cache_size_ratio = server_settings.index_uncompressed_cache_size_ratio; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 8172a468f89..de3eda96bac 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -162,6 +162,7 @@ enum class AccessType M(SYSTEM_DROP_COMPILED_EXPRESSION_CACHE, "SYSTEM DROP COMPILED EXPRESSION, DROP COMPILED EXPRESSION CACHE, DROP COMPILED EXPRESSIONS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FILESYSTEM_CACHE, "SYSTEM DROP FILESYSTEM CACHE, DROP FILESYSTEM CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_SYNC_FILESYSTEM_CACHE, "SYSTEM REPAIR FILESYSTEM CACHE, REPAIR FILESYSTEM CACHE, SYNC FILESYSTEM CACHE", GLOBAL, SYSTEM) \ + M(SYSTEM_DROP_PAGE_CACHE, "SYSTEM DROP PAGE CACHE, DROP PAGE CACHE", GLOBAL, 
SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ diff --git a/src/Common/PageCache.cpp b/src/Common/PageCache.cpp new file mode 100644 index 00000000000..511ec23d431 --- /dev/null +++ b/src/Common/PageCache.cpp @@ -0,0 +1,688 @@ +#include "PageCache.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event PageCacheChunkMisses; + extern const Event PageCacheChunkShared; + extern const Event PageCacheChunkDataHits; + extern const Event PageCacheChunkDataPartialHits; + extern const Event PageCacheChunkDataMisses; + extern const Event PageCacheBytesUnpinnedRoundedToPages; + extern const Event PageCacheBytesUnpinnedRoundedToHugePages; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYSTEM_ERROR; + extern const int MEMORY_LIMIT_EXCEEDED; + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int INVALID_SETTING_VALUE; + extern const int FILE_DOESNT_EXIST; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunknown-warning-option" +#pragma clang diagnostic ignored "-Wreadability-make-member-function-const" + +PinnedPageChunk::PinnedPageChunk(PinnedPageChunk && c) noexcept + : cache(std::exchange(c.cache, nullptr)), chunk(std::exchange(c.chunk, nullptr)) {} + +PinnedPageChunk & PinnedPageChunk::operator=(PinnedPageChunk && c) noexcept +{ + if (cache) + cache->removeRef(chunk); + cache = std::exchange(c.cache, nullptr); + chunk = std::exchange(c.chunk, nullptr); + return *this; +} + +PinnedPageChunk::~PinnedPageChunk() noexcept +{ + if (cache) + cache->removeRef(chunk); +} + +PinnedPageChunk::PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept : cache(cache_), chunk(chunk_) {} + +const PageChunk * PinnedPageChunk::getChunk() const { return chunk; } + +bool PinnedPageChunk::markPagePopulated(size_t page_idx) +{ + bool r = chunk->pages_populated.set(page_idx); + return r; +} + +void PinnedPageChunk::markPrefixPopulated(size_t bytes) +{ + for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i) + markPagePopulated(i); +} + +bool PinnedPageChunk::isPrefixPopulated(size_t bytes) const +{ + for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i) + if (!chunk->pages_populated.get(i)) + return false; + return true; +} + +AtomicBitSet::AtomicBitSet() = default; + +void AtomicBitSet::init(size_t nn) +{ + n = nn; + v = std::make_unique[]>((n + 7) / 8); +} + +bool AtomicBitSet::get(size_t i) const +{ + return (v[i / 8] & (1 << (i % 8))) != 0; +} + +bool AtomicBitSet::any() const +{ + for (size_t i = 0; i < (n + 7) / 8; ++i) + if (v[i]) + return true; + return false; +} + +bool AtomicBitSet::set(size_t i) const +{ + UInt8 prev = v[i / 8].fetch_or(1 << (i % 8)); + return (prev & (1 << (i % 8))) == 0; +} + +bool AtomicBitSet::set(size_t i, bool val) const +{ + if (val) + return set(i); + else + return unset(i); +} + +bool AtomicBitSet::unset(size_t i) const +{ + UInt8 prev = v[i / 8].fetch_and(~(1 << (i % 8))); + return (prev & (1 << (i % 8))) != 0; +} + +void AtomicBitSet::unsetAll() const +{ + for (size_t i = 0; i < (n + 7) / 8; ++i) + v[i].store(0, std::memory_order_relaxed); +} + 
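A quick aside on the AtomicBitSet just defined: set() and unset() report whether the bit actually changed, which is what lets concurrent threads race on populating the same page with exactly one winner (markPagePopulated() above forwards this result). A tiny illustration, not part of the patch:

    void atomicBitSetExample()
    {
        AtomicBitSet bits;
        bits.init(16);

        chassert(bits.set(3));   /// 0 -> 1: true, this caller populated the bit
        chassert(!bits.set(3));  /// already 1: false, another caller got there first
        chassert(bits.get(3));
        chassert(bits.unset(3)); /// 1 -> 0: true
        chassert(!bits.any());   /// all bits clear again
    }
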
+PageCache::PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free_, bool use_huge_pages_) + : bytes_per_page(getPageSize()) + , use_madv_free(use_madv_free_) + , use_huge_pages(use_huge_pages_) + , rng(randomSeed()) +{ + if (bytes_per_chunk == 0 || bytes_per_mmap == 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Userspace page cache chunk size and mmap size can't be zero."); + + if (use_huge_pages) + { + use_huge_pages = false; + bool print_warning = false; +#ifdef OS_LINUX + try + { + ReadBufferFromFile in("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"); + size_t huge_page_size; + readIntText(huge_page_size, in); + + if (huge_page_size == 0 || huge_page_size % bytes_per_page != 0) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Invalid huge page size reported by the OS: {}", huge_page_size); + + /// THP can be configured to be 2 MiB or 1 GiB in size. 1 GiB is way too big for us. + if (huge_page_size <= (16 << 20)) + { + pages_per_big_page = huge_page_size / bytes_per_page; + use_huge_pages = true; + } + else + { + LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS huge page size is too large for our purposes: {} KiB. Using regular pages. Userspace page cache will be relatively slow.", huge_page_size); + } + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) + throw; + print_warning = true; + } +#else + print_warning = true; +#endif + if (print_warning) + LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS doesn't support transparent huge pages. Userspace page cache will be relatively slow."); + } + + pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page; + chunks_per_mmap_target = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1; + max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap_target) + 1; +} + +PageCache::~PageCache() +{ + chassert(getPinnedSize() == 0); +} + +size_t PageCache::pageSize() const { return bytes_per_page; } +size_t PageCache::chunkSize() const { return bytes_per_page * pages_per_chunk; } +size_t PageCache::maxChunks() const { return chunks_per_mmap_target * max_mmaps; } + +size_t PageCache::getPinnedSize() const +{ + std::unique_lock lock(global_mutex); + return (total_chunks - lru.size()) * bytes_per_page * pages_per_chunk; +} + +PageCache::MemoryStats PageCache::getResidentSetSize() const +{ + MemoryStats stats; +#ifdef OS_LINUX + if (use_madv_free) + { + std::unordered_set cache_mmap_addrs; + for (const auto & m : mmaps) + cache_mmap_addrs.insert(reinterpret_cast(m.ptr)); + + ReadBufferFromFile in("/proc/self/smaps"); + + /// Parse the smaps contents, which is text consisting of entries like this: + /// + /// 117ba4a00000-117be4a00000 rw-p 00000000 00:00 0 + /// Size: 1048576 kB + /// KernelPageSize: 4 kB + /// MMUPageSize: 4 kB + /// Rss: 539516 kB + /// Pss: 539516 kB + /// ... 
+
+        auto read_token = [&]
+        {
+            String res;
+            while (!in.eof())
+            {
+                char c = *in.position();
+                if (c == '\n' || c == '\t' || c == ' ' || c == '-')
+                    break;
+                res += c;
+                ++in.position();
+            }
+            return res;
+        };
+
+        auto skip_whitespace = [&]
+        {
+            while (!in.eof())
+            {
+                char c = *in.position();
+                if (c != ' ' && c != '\t')
+                    break;
+                ++in.position();
+            }
+        };
+
+        bool current_range_is_cache = false;
+        size_t total_rss = 0;
+        size_t total_lazy_free = 0;
+        while (!in.eof())
+        {
+            String s = read_token();
+            if (!in.eof() && *in.position() == '-')
+            {
+                if (s.size() < 16)
+                    s.insert(0, 16 - s.size(), '0');
+                UInt64 addr = unhexUInt<UInt64>(s.c_str());
+                current_range_is_cache = cache_mmap_addrs.contains(addr);
+            }
+            else if (s == "Rss:" || s == "LazyFree:")
+            {
+                skip_whitespace();
+                size_t val;
+                readIntText(val, in);
+                skip_whitespace();
+                String unit = read_token();
+                if (unit != "kB")
+                    throw Exception(ErrorCodes::SYSTEM_ERROR, "Unexpected units in /proc/self/smaps: {}", unit);
+                size_t bytes = val * 1024;
+
+                if (s == "Rss:")
+                {
+                    total_rss += bytes;
+                    if (current_range_is_cache)
+                        stats.page_cache_rss += bytes;
+                }
+                else
+                    total_lazy_free += bytes;
+            }
+            skipToNextLineOrEOF(in);
+        }
+        stats.unreclaimable_rss = total_rss - std::min(total_lazy_free, total_rss);
+
+        return stats;
+    }
+#endif
+
+    stats.page_cache_rss = bytes_per_page * pages_per_chunk * total_chunks;
+    return stats;
+}
+
+PinnedPageChunk PageCache::getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction)
+{
+    PageChunk * chunk;
+    /// Make sure we increment exactly one of the counters about the fate of a chunk lookup.
+    bool incremented_profile_events = false;
+
+    {
+        std::unique_lock lock(global_mutex);
+
+        auto * it = chunk_by_key.find(key);
+        if (it == chunk_by_key.end())
+        {
+            chunk = getFreeChunk(lock);
+            chassert(!chunk->key.has_value());
+
+            if (!detached_if_missing)
+            {
+                chunk->key = key;
+                chunk_by_key.insert({key, chunk});
+            }
+
+            ProfileEvents::increment(ProfileEvents::PageCacheChunkMisses);
+            incremented_profile_events = true;
+        }
+        else
+        {
+            chunk = it->getMapped();
+            size_t prev_pin_count = chunk->pin_count.fetch_add(1);
+
+            if (prev_pin_count == 0)
+            {
+                /// Not eligible for LRU eviction while pinned.
+                chassert(chunk->is_linked());
+                lru.erase(lru.iterator_to(*chunk));
+
+                if (detached_if_missing)
+                {
+                    /// Peek the first page to see if it's evicted.
+                    /// (Why not use the full probing procedure instead, restoreChunkFromLimbo()?
+                    /// Right here we can't do it because of how the two mutexes are organized.
+                    /// And we want to do the check+detach before unlocking global_mutex, because
+                    /// otherwise we may detach a chunk pinned by someone else, which may be unexpected
+                    /// for that someone else. Or maybe the latter is fine, dropCache() already does it.)
+                    if (chunk->pages_populated.get(0) && reinterpret_cast<std::atomic<char> *>(chunk->data)->load(std::memory_order_relaxed) == 0)
+                        evictChunk(chunk, lock);
+                }
+
+                if (inject_eviction && chunk->key.has_value() && rng() % 10 == 0)
+                {
+                    /// Simulate eviction of the chunk or some of its pages.
+ if (rng() % 2 == 0) + evictChunk(chunk, lock); + else + for (size_t i = 0; i < 20; ++i) + chunk->pages_populated.unset(rng() % (chunk->size / chunk->page_size)); + } + } + else + { + ProfileEvents::increment(ProfileEvents::PageCacheChunkShared); + incremented_profile_events = true; + } + } + } + + { + std::unique_lock chunk_lock(chunk->chunk_mutex); + + if (chunk->pages_state == PageChunkState::Limbo) + { + auto [pages_restored, pages_evicted] = restoreChunkFromLimbo(chunk, chunk_lock); + chunk->pages_state = PageChunkState::Stable; + + if (!incremented_profile_events) + { + if (pages_evicted == 0) + ProfileEvents::increment(ProfileEvents::PageCacheChunkDataHits); + else if (pages_evicted < pages_restored) + ProfileEvents::increment(ProfileEvents::PageCacheChunkDataPartialHits); + else + ProfileEvents::increment(ProfileEvents::PageCacheChunkDataMisses); + } + } + } + + return PinnedPageChunk(this, chunk); +} + +void PageCache::removeRef(PageChunk * chunk) noexcept +{ + /// Fast path if this is not the last reference. + size_t prev_pin_count = chunk->pin_count.load(); + if (prev_pin_count > 1 && chunk->pin_count.compare_exchange_strong(prev_pin_count, prev_pin_count - 1)) + return; + + { + std::unique_lock lock(global_mutex); + + prev_pin_count = chunk->pin_count.fetch_sub(1); + if (prev_pin_count > 1) + return; + + chassert(!chunk->is_linked()); + if (chunk->key.has_value()) + lru.push_back(*chunk); + else + /// Unpinning detached chunk. We'd rather reuse it soon, so put it at the front. + lru.push_front(*chunk); + } + + { + std::unique_lock chunk_lock(chunk->chunk_mutex); + + /// Need to be extra careful here because we unlocked global_mutex above, so other + /// getOrSet()/removeRef() calls could have happened during this brief period. + if (use_madv_free && chunk->pages_state == PageChunkState::Stable && chunk->pin_count.load() == 0) + { + sendChunkToLimbo(chunk, chunk_lock); + chunk->pages_state = PageChunkState::Limbo; + } + } +} + +static void logUnexpectedSyscallError(std::string name) +{ + std::string message = fmt::format("{} failed: {}", name, errnoToString()); + LOG_WARNING(&Poco::Logger::get("PageCache"), "{}", message); +#if defined(ABORT_ON_LOGICAL_ERROR) + volatile bool true_ = true; + if (true_) // suppress warning about missing [[noreturn]] + abortOnFailedAssertion(message); +#endif +} + +void PageCache::sendChunkToLimbo(PageChunk * chunk [[maybe_unused]], std::unique_lock & /* chunk_mutex */) const noexcept +{ +#ifdef MADV_FREE // if we're not on a very old version of Linux + chassert(chunk->size == bytes_per_page * pages_per_chunk); + size_t populated_pages = 0; + size_t populated_big_pages = 0; + for (size_t big_page_idx = 0; big_page_idx < pages_per_chunk / pages_per_big_page; ++big_page_idx) + { + bool big_page_populated = false; + for (size_t sub_idx = 0; sub_idx < pages_per_big_page; ++sub_idx) + { + size_t idx = big_page_idx * pages_per_big_page + sub_idx; + if (!chunk->pages_populated.get(idx)) + continue; + big_page_populated = true; + populated_pages += 1; + + auto & byte = reinterpret_cast &>(chunk->data[idx * bytes_per_page]); + chunk->first_bit_of_each_page.set(idx, (byte.load(std::memory_order_relaxed) & 1) != 0); + byte.fetch_or(1, std::memory_order_relaxed); + } + if (big_page_populated) + populated_big_pages += 1; + } + int r = madvise(chunk->data, chunk->size, MADV_FREE); + if (r != 0) + logUnexpectedSyscallError("madvise(MADV_FREE)"); + + ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToPages, bytes_per_page * populated_pages); + 
ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToHugePages, bytes_per_page * pages_per_big_page * populated_big_pages); +#endif +} + +std::pair PageCache::restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept +{ + static_assert(sizeof(std::atomic) == 1, "char is not atomic?"); + // Make sure our strategic memory reads/writes are not reordered or optimized out. + auto * data = reinterpret_cast *>(chunk->data); + size_t pages_restored = 0; + size_t pages_evicted = 0; + for (size_t idx = 0; idx < chunk->size / bytes_per_page; ++idx) + { + if (!chunk->pages_populated.get(idx)) + continue; + + /// After MADV_FREE, it's guaranteed that: + /// * writing to the page makes it non-freeable again (reading doesn't), + /// * after the write, the page contents are either fully intact or fully zero-filled, + /// * even before the write, reads return either intact data (if the page wasn't freed) or zeroes (if it was, and the read page-faulted). + /// (And when doing the write there's no way to tell whether it page-faulted or not, AFAICT; that would make our life much easier!) + /// + /// With that in mind, we do the following dance to bring the page back from the MADV_FREE limbo: + /// 0. [in advance] Before doing MADV_FREE, make sure the page's first byte is not zero. + /// We do it by setting the lowest bit of the first byte to 1, after saving the original value of that bit into a bitset. + /// 1. Read the second byte. + /// 2. Write the second byte back. This makes the page non-freeable. + /// 3. Read the first byte. + /// 3a. If it's zero, the page was freed. + /// Set the second byte to 0, to keep the buffer zero-filled if the page was freed + /// between steps 1 and 2. + /// 3b. If it's nonzero, the page is intact. + /// Restore the lowest bit of the first byte to the saved original value from the bitset. + + char second_byte = data[idx * bytes_per_page + 1].load(std::memory_order_relaxed); + data[idx * bytes_per_page + 1].store(second_byte, std::memory_order_relaxed); + + char first_byte = data[idx * bytes_per_page].load(std::memory_order_relaxed); + if (first_byte == 0) + { + pages_evicted += 1; + data[idx * bytes_per_page + 1].store(0, std::memory_order_relaxed); + chunk->pages_populated.unset(idx); + } + else + { + pages_restored += 1; + chassert(first_byte & 1); + if (!chunk->first_bit_of_each_page.get(idx)) + data[idx * bytes_per_page].fetch_and(~1, std::memory_order_relaxed); + } + } + return {pages_restored, pages_evicted}; +} + +PageChunk * PageCache::getFreeChunk(std::unique_lock & lock /* global_mutex */) +{ + if (lru.empty() || (mmaps.size() < max_mmaps && lru.front().key.has_value())) + addMmap(lock); + if (lru.empty()) + throw Exception(ErrorCodes::MEMORY_LIMIT_EXCEEDED, "All chunks in the entire page cache ({:.3} GiB) are pinned.", + bytes_per_page * pages_per_chunk * total_chunks * 1. / (1l << 30)); + + PageChunk * chunk = &lru.front(); + lru.erase(lru.iterator_to(*chunk)); + + size_t prev_pin_count = chunk->pin_count.fetch_add(1); + chassert(prev_pin_count == 0); + + evictChunk(chunk, lock); + + return chunk; +} + +void PageCache::evictChunk(PageChunk * chunk, std::unique_lock & /* global_mutex */) +{ + if (chunk->key.has_value()) + { + size_t erased = chunk_by_key.erase(chunk->key.value()); + chassert(erased); + chunk->key.reset(); + } + + chunk->state.reset(); + + /// This is tricky. 
We're not holding the chunk_mutex, so another thread might be running + /// sendChunkToLimbo() or even restoreChunkFromLimbo() on this chunk right now. + /// + /// Nevertheless, it's correct and sufficient to clear pages_populated here because sendChunkToLimbo() + /// and restoreChunkFromLimbo() only touch pages_populated (only unsetting the bits), + /// first_bit_of_each_page, and the data; and we don't care about first_bit_of_each_page and the data. + /// + /// This is precarious, but I don't have better ideas. Note that this clearing (or something else) + /// must be done before unlocking the global_mutex because otherwise another call to getOrSet() might + /// return this chunk before we clear it. + chunk->pages_populated.unsetAll(); +} + +void PageCache::addMmap(std::unique_lock & /* global_mutex */) +{ + /// ASLR by hand. + void * address_hint = reinterpret_cast(std::uniform_int_distribution(0x100000000000UL, 0x700000000000UL)(rng)); + + mmaps.emplace_back(bytes_per_page, pages_per_chunk, pages_per_big_page, chunks_per_mmap_target, address_hint, use_huge_pages); + + size_t num_chunks = mmaps.back().num_chunks; + total_chunks += num_chunks; + for (size_t i = 0; i < num_chunks; ++i) + /// Link in reverse order, so they get assigned in increasing order. Not important, just seems nice. + lru.push_front(mmaps.back().chunks[num_chunks - 1 - i]); +} + +void PageCache::dropCache() +{ + std::unique_lock lock(global_mutex); + + /// Detach and free unpinned chunks. + bool logged_error = false; + for (PageChunk & chunk : lru) + { + evictChunk(&chunk, lock); + + if (use_madv_free) + { + /// This might happen in parallel with sendChunkToLimbo() or restoreChunkFromLimbo(), but it's ok. + int r = madvise(chunk.data, chunk.size, MADV_DONTNEED); + if (r != 0 && !logged_error) + { + logUnexpectedSyscallError("madvise(MADV_DONTNEED)"); + logged_error = true; + } + } + } + + /// Detach pinned chunks. 
+ for (auto [key, chunk] : chunk_by_key) + { + chassert(chunk->key == key); + chassert(chunk->pin_count > 0); // otherwise it would have been evicted above + chunk->key.reset(); + } + chunk_by_key.clear(); +} + +PageCache::Mmap::Mmap(size_t bytes_per_page_, size_t pages_per_chunk_, size_t pages_per_big_page_, size_t num_chunks_, void * address_hint, bool use_huge_pages_) +{ + num_chunks = num_chunks_; + size = bytes_per_page_ * pages_per_chunk_ * num_chunks; + + size_t alignment = bytes_per_page_ * pages_per_big_page_; + address_hint = reinterpret_cast(reinterpret_cast(address_hint) / alignment * alignment); + + auto temp_chunks = std::make_unique(num_chunks); + + int flags = MAP_PRIVATE | MAP_ANONYMOUS; +#ifdef OS_LINUX + flags |= MAP_NORESERVE; +#endif + ptr = mmap(address_hint, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (MAP_FAILED == ptr) + throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, fmt::format("Cannot mmap {}.", ReadableSize(size))); + if (reinterpret_cast(ptr) % bytes_per_page_ != 0) + { + munmap(ptr, size); + throw Exception(ErrorCodes::SYSTEM_ERROR, "mmap returned unaligned address: {}", ptr); + } + + void * chunks_start = ptr; + +#ifdef OS_LINUX + if (madvise(ptr, size, MADV_DONTDUMP) != 0) + logUnexpectedSyscallError("madvise(MADV_DONTDUMP)"); + if (madvise(ptr, size, MADV_DONTFORK) != 0) + logUnexpectedSyscallError("madvise(MADV_DONTFORK)"); + + if (use_huge_pages_) + { + if (reinterpret_cast(ptr) % alignment != 0) + { + LOG_DEBUG(&Poco::Logger::get("PageCache"), "mmap() returned address not aligned on huge page boundary."); + chunks_start = reinterpret_cast((reinterpret_cast(ptr) / alignment + 1) * alignment); + chassert(reinterpret_cast(chunks_start) % alignment == 0); + num_chunks -= 1; + } + + if (madvise(ptr, size, MADV_HUGEPAGE) != 0) + LOG_WARNING(&Poco::Logger::get("PageCache"), + "madvise(MADV_HUGEPAGE) failed: {}. Userspace page cache will be relatively slow.", errnoToString()); + } +#else + (void)use_huge_pages_; +#endif + + chunks = std::move(temp_chunks); + for (size_t i = 0; i < num_chunks; ++i) + { + PageChunk * chunk = &chunks[i]; + chunk->data = reinterpret_cast(chunks_start) + bytes_per_page_ * pages_per_chunk_ * i; + chunk->size = bytes_per_page_ * pages_per_chunk_; + chunk->page_size = bytes_per_page_; + chunk->big_page_size = bytes_per_page_ * pages_per_big_page_; + chunk->pages_populated.init(pages_per_chunk_); + chunk->first_bit_of_each_page.init(pages_per_chunk_); + } +} + +PageCache::Mmap::Mmap(Mmap && m) noexcept : ptr(std::exchange(m.ptr, nullptr)), size(std::exchange(m.size, 0)), chunks(std::move(m.chunks)), num_chunks(std::exchange(m.num_chunks, 0)) {} + +PageCache::Mmap::~Mmap() noexcept +{ + if (ptr && 0 != munmap(ptr, size)) + logUnexpectedSyscallError("munmap"); +} + +void FileChunkState::reset() {} + +PageCacheKey FileChunkAddress::hash() const +{ + SipHash hash(offset); + hash.update(path.data(), path.size()); + if (!file_version.empty()) + { + hash.update("\0", 1); + hash.update(file_version.data(), file_version.size()); + } + return hash.get128(); +} + +std::string FileChunkAddress::toString() const +{ + return fmt::format("{}:{}{}{}", path, offset, file_version.empty() ? 
"" : ":", file_version); +} + +#pragma clang diagnostic pop + +} diff --git a/src/Common/PageCache.h b/src/Common/PageCache.h new file mode 100644 index 00000000000..7ff376baa6b --- /dev/null +++ b/src/Common/PageCache.h @@ -0,0 +1,299 @@ +#pragma once + +#include +#include +#include +#include +#include + +/// "Userspace page cache" +/// A cache for contents of remote files. +/// Uses MADV_FREE to allow Linux to evict pages from our cache under memory pressure. +/// Typically takes up almost all of the available memory, similar to the actual page cache. +/// +/// Intended for caching data retrieved from distributed cache, but can be used for other things too, +/// just replace FileChunkState with a discriminated union, or something, if needed. +/// +/// There are two fixed-size units of caching here: +/// * OS pages, typically 4 KiB each. +/// * Page chunks, 2 MiB each (configurable with page_cache_block_size setting). +/// +/// Each file is logically split into aligned 2 MiB blocks, which are mapped to page chunks inside the cache. +/// They are cached independently from each other. +/// +/// Each page chunk has a contiguous 2 MiB buffer that can be pinned and directly used e.g. by ReadBuffers. +/// While pinned (by at least one PinnedPageChunk), the pages are not reclaimable by the OS. +/// +/// Inside each page chunk, any subset of pages may be populated. Unpopulated pages may or not be +/// mapped to any physical RAM. We maintain a bitmask that keeps track of which pages are populated. +/// Pages become unpopulated if they're reclaimed by the OS (when the page chunk is not pinned), +/// or if we just never populate them in the first place (e.g. if a file is shorter than 2 MiB we +/// still create a 2 MiB page chunk, but use only a prefix of it). +/// +/// There are two separate eviction mechanisms at play: +/// * LRU eviction of page chunks in PageCache. +/// * OS reclaiming pages on memory pressure. We have no control over the eviction policy. +/// It probably picks the pages in the same order in which they were marked with MADV_FREE, so +/// effectively in the same LRU order as our policy in PageCache. +/// When using PageCache in oversubscribed fashion, using all available memory and relying on OS eviction, +/// the PageCache's eviction policy mostly doesn't matter. It just needs to be similar enough to the OS's +/// policy that we rarely evict chunks with unevicted pages. +/// +/// We mmap memory directly instead of using allocator because this enables: +/// * knowing how much RAM the cache is using, via /proc/self/smaps, +/// * MADV_HUGEPAGE (use transparent huge pages - this makes MADV_FREE 10x less slow), +/// * MAP_NORESERVE (don't reserve swap space - otherwise large mmaps usually fail), +/// * MADV_DONTDUMP (don't include in core dumps), +/// * page-aligned addresses without padding. +/// +/// madvise(MADV_FREE) call is slow: ~6 GiB/s (doesn't scale with more threads). Enabling transparent +/// huge pages (MADV_HUGEPAGE) makes it 10x less slow, so we do that. That makes the physical RAM allocation +/// work at 2 MiB granularity instead of 4 KiB, so the cache becomes less suitable for small files. +/// If this turns out to be a problem, we may consider allowing different mmaps to have different flags, +/// some having no huge pages. +/// Note that we do our bookkeeping at small-page granularity even if huge pages are enabled. 
+
+namespace DB
+{
+
+/// Hash of FileChunkAddress.
+using PageCacheKey = UInt128;
+
+/// Identifies a chunk of a file or object.
+/// We assume that contents of such file/object don't change (without file_version changing), so
+/// cache invalidation is never needed.
+struct FileChunkAddress
+{
+    /// Path, usually prefixed with storage system name and anything else needed to make it unique.
+    /// E.g. "s3:<bucket>/<path>"
+    std::string path;
+    /// Optional string with ETag, or file modification time, or anything else.
+    std::string file_version;
+    size_t offset = 0;
+
+    PageCacheKey hash() const;
+
+    std::string toString() const;
+};
+
+struct AtomicBitSet
+{
+    size_t n = 0;
+    std::unique_ptr<std::atomic<UInt8>[]> v;
+
+    AtomicBitSet();
+
+    void init(size_t n);
+
+    bool get(size_t i) const;
+    bool any() const;
+    /// These return true if the bit was changed, false if it already had the target value.
+    /// (These methods are logically not const, but clang insists that I make them const, and
+    ///  '#pragma clang diagnostic ignored' doesn't seem to work.)
+    bool set(size_t i) const;
+    bool set(size_t i, bool val) const;
+    bool unset(size_t i) const;
+    void unsetAll() const;
+};
+
+enum class PageChunkState
+{
+    /// Pages are not reclaimable by the OS, the buffer has correct contents.
+    Stable,
+    /// Pages are reclaimable by the OS, the buffer contents are altered (first bit of each page set to 1).
+    Limbo,
+};
+
+/// (This is a separate struct just in case we want to use this cache for other things in future.
+///  Then this struct would be the customization point, while the rest of PageChunk can stay unchanged.)
+struct FileChunkState
+{
+    std::mutex download_mutex;
+
+    void reset();
+};
+
+using PageChunkLRUListHook = boost::intrusive::list_base_hook<>;
+
+/// Cache entry.
+struct PageChunk : public PageChunkLRUListHook
+{
+    char * data;
+    size_t size; // in bytes
+    /// Page size for use in pages_populated and first_bit_of_each_page. Same as PageCache::pageSize().
+    size_t page_size;
+
+    /// Actual eviction granularity. Just for information. If huge pages are used, huge page size, otherwise page_size.
+    size_t big_page_size;
+
+    mutable FileChunkState state;
+
+    AtomicBitSet pages_populated;
+
+private:
+    friend class PinnedPageChunk;
+    friend class PageCache;
+
+    /// If nullopt, the chunk is "detached", i.e. not associated with any key.
+    /// Detached chunks may still be pinned. Chunk may get detached even while pinned, in particular when dropping cache.
+    /// Protected by global_mutex.
+    std::optional<PageCacheKey> key;
+
+    /// Refcount for usage of this chunk. When zero, the pages are reclaimable by the OS, and
+    /// the PageChunk itself is evictable (linked into PageCache::lru).
+    std::atomic<size_t> pin_count {0};
+
+    /// Bit mask containing the first bit of data from each page. Needed for the weird probing procedure when un-MADV_FREE-ing the pages.
+    AtomicBitSet first_bit_of_each_page;
+
+    /// Locked when changing pages_state, along with the corresponding expensive MADV_FREE/un-MADV_FREE operation.
+    mutable std::mutex chunk_mutex;
+
+    /// Normally pin_count == 0 <=> state == PageChunkState::Limbo,
+    ///          pin_count > 0  <=> state == PageChunkState::Stable.
+    /// This separate field is needed because of synchronization: pin_count is changed with global_mutex locked,
+    /// this field is changed with chunk_mutex locked, and we never have to lock both mutexes at once.
+    PageChunkState pages_state = PageChunkState::Stable;
+};
+
+class PageCache;
+
+/// Handle for a cache entry. Neither the entry nor its pages can get evicted while there's at least one PinnedPageChunk pointing to it.
+class PinnedPageChunk
+{
+public:
+    const PageChunk * getChunk() const;
+
+    /// Sets the bit in pages_populated. Returns true if it actually changed (i.e. was previously 0).
+    bool markPagePopulated(size_t page_idx);
+
+    /// Calls markPagePopulated() for pages 0..ceil(bytes/page_size).
+    void markPrefixPopulated(size_t bytes);
+
+    bool isPrefixPopulated(size_t bytes) const;
+
+    PinnedPageChunk() = default;
+    ~PinnedPageChunk() noexcept;
+
+    PinnedPageChunk(PinnedPageChunk &&) noexcept;
+    PinnedPageChunk & operator=(PinnedPageChunk &&) noexcept;
+
+private:
+    friend class PageCache;
+
+    PageCache * cache = nullptr;
+    PageChunk * chunk = nullptr;
+
+    PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept;
+};
+
+class PageCache
+{
+public:
+    PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages);
+    ~PageCache();
+
+    /// Get or insert a chunk for the given key.
+    ///
+    /// If detached_if_missing = true, and the key is not present in the cache, the returned chunk
+    /// won't be associated with the key and will be evicted as soon as it's unpinned.
+    /// It's like "get if exists, otherwise return null", but instead of null we return a usable
+    /// temporary buffer, for convenience. Pinning and page eviction make the story more complicated:
+    ///  * If the chunk for this key is pinned, we return it even if it's not fully populated
+    ///    (because PageCache doesn't know what "fully populated" means).
+    ///  * If the chunk exists, but some of its pages were evicted, we detach it. (Currently we only
+    ///    check the first page here.)
+    PinnedPageChunk getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction);
+
+    /// OS page size, e.g. 4 KiB on x86, 4 KiB or 64 KiB on aarch64.
+    ///
+    /// If transparent huge pages are enabled, this is still the regular page size, and all our bookkeeping
+    /// is still based on regular page size (e.g. pages_populated), because (a) it's cheap anyway,
+    /// and (b) I'm not sure if Linux guarantees that MADV_FREE reclamation always happens at huge page
+    /// granularity, and wouldn't want to rely on this even if it does.
+    size_t pageSize() const;
+    size_t chunkSize() const;
+    size_t maxChunks() const;
+
+    struct MemoryStats
+    {
+        /// How many bytes of actual RAM are used for the cache pages. Doesn't include metadata
+        /// and overhead (e.g. PageChunk structs).
+        size_t page_cache_rss = 0;
+        /// Resident set size for the whole process, excluding any MADV_FREE pages (PageCache's or not).
+        /// This can be used as a more useful memory usage number for clickhouse server, instead of RSS.
+        /// Populated only if MADV_FREE is used, otherwise zero.
+        std::optional<size_t> unreclaimable_rss;
+    };
+
+    /// Reads /proc/self/smaps, so not very fast.
+    MemoryStats getResidentSetSize() const;
+
+    /// Total length of memory ranges currently pinned by PinnedPageChunk-s, including unpopulated pages.
+    size_t getPinnedSize() const;
+
+    /// Clears the key -> chunk mapping. Frees memory (MADV_DONTNEED) of all chunks that are not pinned.
+    /// Doesn't unmap any virtual memory. Detaches but doesn't free the pinned chunks.
+    /// Locks the global mutex for the duration of the operation, which may block queries for hundreds of milliseconds.
+    void dropCache();
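For orientation, here is a minimal read-through sketch against the public interface above (an editorial illustration, not part of the patch; the real user of this pattern is CachedInMemoryReadBufferFromFile, added later in this series). It assumes <functional> is available, and fill_from_source is a hypothetical stand-in for whatever reads the chunk's bytes from the underlying storage:

    // Pin a chunk, populate it on miss, and hand back the pinned handle.
    PinnedPageChunk readThrough(PageCache & cache, PageCacheKey key, size_t bytes,
                                const std::function<void(char *, size_t)> & fill_from_source)
    {
        PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false);

        // download_mutex serializes threads trying to populate the same chunk.
        std::lock_guard lock(chunk.getChunk()->state.download_mutex);
        if (!chunk.isPrefixPopulated(bytes))
        {
            fill_from_source(chunk.getChunk()->data, bytes);  // hypothetical callback
            chunk.markPrefixPopulated(bytes);
        }
        return chunk;  // pages stay resident until the last PinnedPageChunk is destroyed
    }
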
+
+private:
+    friend class PinnedPageChunk;
+
+    struct Mmap
+    {
+        void * ptr = nullptr;
+        size_t size = 0;
+
+        std::unique_ptr<PageChunk[]> chunks;
+        size_t num_chunks = 0; // might be smaller than chunks_per_mmap_target because of alignment
+
+        Mmap(Mmap &&) noexcept;
+        Mmap(size_t bytes_per_page, size_t pages_per_chunk, size_t pages_per_big_page, size_t num_chunks, void * address_hint, bool use_huge_pages_);
+        ~Mmap() noexcept;
+    };
+
+    size_t bytes_per_page;
+    size_t pages_per_chunk;
+    size_t chunks_per_mmap_target;
+    size_t max_mmaps;
+    size_t pages_per_big_page = 1; // if huge pages are used, huge_page_size/page_size, otherwise 1
+    bool use_madv_free = true;
+    bool use_huge_pages = true;
+
+    mutable std::mutex global_mutex;
+
+    pcg64 rng;
+
+    std::vector<Mmap> mmaps;
+    size_t total_chunks = 0;
+
+    /// All non-pinned chunks, including ones not assigned to any file. Least recently used is begin().
+    boost::intrusive::list<PageChunk, boost::intrusive::base_hook<PageChunkLRUListHook>, boost::intrusive::constant_time_size<true>> lru;
+
+    HashMap<PageCacheKey, PageChunk *> chunk_by_key;
+
+    /// Get a usable chunk, doing eviction or allocation if needed.
+    /// Caller is responsible for clearing pages_populated.
+    PageChunk * getFreeChunk(std::unique_lock<std::mutex> & /* global_mutex */);
+    void addMmap(std::unique_lock<std::mutex> & /* global_mutex */);
+    void evictChunk(PageChunk * chunk, std::unique_lock<std::mutex> & /* global_mutex */);
+
+    void removeRef(PageChunk * chunk) noexcept;
+
+    /// These may run in parallel with getFreeChunk(), so be very careful about which fields of the PageChunk we touch here.
+    void sendChunkToLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept;
+    /// Returns {pages_restored, pages_evicted}.
+    std::pair<size_t, size_t> restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock<std::mutex> & /* chunk_mutex */) const noexcept;
+};
+
+using PageCachePtr = std::shared_ptr<PageCache>;
+
+}
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index d8ca1ab9e93..3a8659b8b27 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -63,6 +63,15 @@ M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.") \
     M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). Only updated for SELECT queries with SETTING use_query_cache = 1.") \
     M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.") \
+    /* Each page cache chunk access increments exactly one of the following 5 PageCacheChunk* counters. */ \
+    /* Something like hit rate: (PageCacheChunkShared + PageCacheChunkDataHits) / [sum of all 5]. */ \
+    M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.") \
+    M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.") \
+    M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.") \
+    M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.") \
+    M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.") \
+    M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.") \
+    M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.") \
     M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).") \
     M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).") \
     M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fallen back to the ordinary reading method.") \
diff --git a/src/Core/Defines.h b/src/Core/Defines.h
index bf9fb1db6bc..cc6f49aa361 100644
--- a/src/Core/Defines.h
+++ b/src/Core/Defines.h
@@ -70,6 +70,15 @@ static constexpr auto DBMS_DEFAULT_MAX_QUERY_SIZE = 262144;
 /// Max depth of hierarchical dictionary
 static constexpr auto DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH = 1000;
 
+#ifdef OS_LINUX
+#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE true
+#else
+/// On Mac OS, MADV_FREE is not lazy, so page_cache_use_madv_free should be disabled.
+/// On FreeBSD, it may work but we haven't tested it.
+#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE false
+#endif
+
+
 /// Default maximum (total and entry) sizes and policies of various caches
 static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_POLICY = "SLRU";
 static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB;
diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h
index 3713d0c3206..a54fb42b464 100644
--- a/src/Core/ServerSettings.h
+++ b/src/Core/ServerSettings.h
@@ -65,7 +65,7 @@ namespace DB
     M(UInt64, max_concurrent_insert_queries, 0, "Maximum number of concurrently INSERT queries. Zero means unlimited.", 0) \
     M(UInt64, max_concurrent_select_queries, 0, "Maximum number of concurrently SELECT queries. Zero means unlimited.", 0) \
     \
-    M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size ro RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \
+    M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size to RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \
     M(String, uncompressed_cache_policy, DEFAULT_UNCOMPRESSED_CACHE_POLICY, "Uncompressed cache policy name.", 0) \
     M(UInt64, uncompressed_cache_size, DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE, "Size of cache for uncompressed blocks. Zero means disabled.", 0) \
     M(Double, uncompressed_cache_size_ratio, DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO, "The size of the protected queue in the uncompressed cache relative to the cache's total size.", 0) \
@@ -78,6 +78,11 @@ namespace DB
     M(String, index_mark_cache_policy, DEFAULT_INDEX_MARK_CACHE_POLICY, "Secondary index mark cache policy name.", 0) \
     M(UInt64, index_mark_cache_size, DEFAULT_INDEX_MARK_CACHE_MAX_SIZE, "Size of cache for secondary index marks. Zero means disabled.", 0) \
     M(Double, index_mark_cache_size_ratio, DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO, "The size of the protected queue in the secondary index mark cache relative to the cache's total size.", 0) \
+    M(UInt64, page_cache_chunk_size, 2 << 20, "Bytes per chunk in userspace page cache. Rounded up to a multiple of page size (typically 4 KiB) or huge page size (typically 2 MiB, only if page_cache_use_transparent_huge_pages is enabled).", 0) \
+    M(UInt64, page_cache_mmap_size, 1 << 30, "Bytes per memory mapping in userspace page cache. Not important.", 0) \
+    M(UInt64, page_cache_size, 10ul << 30, "Amount of virtual memory to map for userspace page cache. If page_cache_use_madv_free is enabled, it's recommended to set this higher than the machine's RAM size. Use 0 to disable userspace page cache.", 0) \
+    M(Bool, page_cache_use_madv_free, DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE, "If true, the userspace page cache will allow the OS to automatically reclaim memory from the cache on memory pressure (using MADV_FREE).", 0) \
+    M(Bool, page_cache_use_transparent_huge_pages, true, "Userspace page cache will attempt to use transparent huge pages on Linux. This is best-effort.", 0) \
     M(UInt64, mmap_cache_size, DEFAULT_MMAP_CACHE_MAX_SIZE, "A cache for mmapped files.", 0) \
     \
     M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index ae6ea165cc9..7d1112af3a7 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -777,6 +777,10 @@ class IColumn;
     M(Bool, throw_on_error_from_cache_on_write_operations, false, "Ignore error from cache when caching on write operations (INSERT, merges)", 0) \
     M(UInt64, filesystem_cache_segments_batch_size, 20, "Limit on size of a single batch of file segments that a read buffer can request from cache. Too low value will lead to excessive requests to cache, too large may slow down eviction from cache", 0) \
     \
+    M(Bool, use_page_cache_for_disks_without_file_cache, false, "Use userspace page cache for remote disks that don't have filesystem cache enabled.", 0) \
+    M(Bool, read_from_page_cache_if_exists_otherwise_bypass_cache, false, "Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache.", 0) \
+    M(Bool, page_cache_inject_eviction, false, "Userspace page cache will sometimes invalidate some pages at random. Intended for testing.", 0) \
+    \
     M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \
     M(Bool, enable_filesystem_read_prefetches_log, false, "Log to system.filesystem_prefetch_log during query. Should be used only for testing or debugging, not recommended to be turned on by default", 0) \
     M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer prefetched threadpool if all parts are on remote filesystem", 0) \
diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
index e8d013d13ec..02ee641903c 100644
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@@ -114,6 +114,9 @@ static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> sett
     {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"},
     {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."},
     {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."},
+    {"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"},
+    {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"},
+    {"page_cache_inject_eviction", false, false, "Added userspace page cache"},
     }},
     {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."},
     {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"},
diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
index 2373640704b..1a9cd2c994c 100644
--- a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
+++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
@@ -129,6 +129,7 @@ void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position)
             /// new read until position is after the current position in the working buffer
             file_offset_of_buffer_end = position;
             working_buffer.resize(working_buffer.size() - (file_offset_of_buffer_end - position));
+            pos = std::min(pos, working_buffer.end());
         }
         else
         {
@@ -235,9 +236,6 @@ bool AsynchronousBoundedReadBuffer::nextImpl()
 
     file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd();
 
-    /// In case of multiple files for the same file in clickhouse (i.e. log family)
-    /// file_offset_of_buffer_end will not match getImplementationBufferOffset()
-    /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()]
     chassert(file_offset_of_buffer_end <= impl->getFileSize());
 
     if (read_until_position && (file_offset_of_buffer_end > *read_until_position))
@@ -264,7 +262,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
     size_t new_pos;
     if (whence == SEEK_SET)
     {
-        assert(offset >= 0);
+        chassert(offset >= 0);
         new_pos = offset;
     }
     else if (whence == SEEK_CUR)
@@ -290,8 +288,8 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
         /// Position is still inside the buffer.
         /// Probably it is at the end of the buffer - then we will load data on the following 'next' call.
         pos = working_buffer.end() - file_offset_of_buffer_end + new_pos;
-        assert(pos >= working_buffer.begin());
-        assert(pos <= working_buffer.end());
+        chassert(pos >= working_buffer.begin());
+        chassert(pos <= working_buffer.end());
 
         return new_pos;
     }
@@ -317,7 +315,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
         break;
     }
 
-    chassert(!prefetch_future.valid());
+    chassert(!prefetch_future.valid());
 
     /// First reset the buffer so the next read will fetch new data to the buffer.
     resetWorkingBuffer();
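The new one-line clamp above guards against `pos` dangling past the end of a shrunk working buffer. A made-up illustration of the hazard (editorial sketch, not from the patch):

    // The working buffer covers file range [0, 1000) and pos sits at local offset 800.
    // A later setReadUntilPosition() shrinks the buffer to [0, 500):
    //     working_buffer.resize(500);
    // Without the clamp, pos would now point 300 bytes past working_buffer.end(), and the
    // next hasPendingData()/read would touch memory outside the buffer. Hence:
    pos = std::min(pos, working_buffer.end()); // keep pos inside the (possibly smaller) buffer
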
diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
index 7ce3d58dcd8..47ee5858562 100644
--- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
+++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
@@ -1215,7 +1215,7 @@ size_t CachedOnDiskReadBufferFromFile::getRemainingSizeToRead()
 
 void CachedOnDiskReadBufferFromFile::setReadUntilPosition(size_t position)
 {
-    if (!allow_seeks_after_first_read)
+    if (initialized && !allow_seeks_after_first_read)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Method `setReadUntilPosition()` not allowed");
 
     if (read_until_position == position)
diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
index 0b3ecca3587..417f7615dd7 100644
--- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
+++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include <IO/CachedInMemoryReadBufferFromFile.h>
 #include
 #include
 #include
@@ -16,12 +17,16 @@ using namespace DB;
 
 namespace
 {
-bool withCache(const ReadSettings & settings)
+bool withFileCache(const ReadSettings & settings)
 {
     return settings.remote_fs_cache && settings.enable_filesystem_cache
         && (!CurrentThread::getQueryId().empty() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache
             || !settings.avoid_readthrough_cache_outside_query_context);
 }
+
+bool withPageCache(const ReadSettings & settings, bool with_file_cache)
+{
+    return settings.page_cache && !with_file_cache && settings.use_page_cache_for_disks_without_file_cache;
+}
 }
 
 namespace DB
@@ -34,7 +39,7 @@ namespace ErrorCodes
 size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_t file_size)
 {
     /// Only when cache is used we could download bigger portions of FileSegments than what we actually gonna read within particular task.
-    if (!withCache(settings))
+    if (!withFileCache(settings))
         return settings.remote_fs_buffer_size;
 
     /// Buffers used for prefetch and pre-download better to have enough size, but not bigger than the whole file.
@@ -44,27 +49,30 @@ size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_
 ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(
     ReadBufferCreator && read_buffer_creator_,
     const StoredObjects & blobs_to_read_,
+    const std::string & cache_path_prefix_,
     const ReadSettings & settings_,
     std::shared_ptr<FilesystemCacheLog> cache_log_,
     bool use_external_buffer_)
-    : ReadBufferFromFileBase(
-        use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(settings_, getTotalSize(blobs_to_read_)), nullptr, 0)
+    : ReadBufferFromFileBase(use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(
+        settings_, getTotalSize(blobs_to_read_)), nullptr, 0)
     , settings(settings_)
     , blobs_to_read(blobs_to_read_)
     , read_buffer_creator(std::move(read_buffer_creator_))
+    , cache_path_prefix(cache_path_prefix_)
     , cache_log(settings.enable_filesystem_cache_log ? cache_log_ : nullptr)
     , query_id(CurrentThread::getQueryId())
     , use_external_buffer(use_external_buffer_)
-    , with_cache(withCache(settings))
+    , with_file_cache(withFileCache(settings))
+    , with_page_cache(withPageCache(settings, with_file_cache))
     , log(getLogger("ReadBufferFromRemoteFSGather"))
 {
     if (!blobs_to_read.empty())
         current_object = blobs_to_read.front();
 }
 
-SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object)
+SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object, size_t start_offset)
 {
-    if (current_buf && !with_cache)
+    if (current_buf && !with_file_cache)
     {
         appendUncachedReadInfo();
     }
@@ -72,30 +80,45 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c
     current_object = object;
     const auto & object_path = object.remote_path;
 
-    size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size;
-    auto current_read_buffer_creator = [=, this]() { return read_buffer_creator(object_path, current_read_until_position); };
+    std::unique_ptr<ReadBufferFromFileBase> buf;
 
 #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD
-    if (with_cache)
+    if (with_file_cache)
     {
         auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path);
-        return std::make_shared<CachedOnDiskReadBufferFromFile>(
+        buf = std::make_unique<CachedOnDiskReadBufferFromFile>(
             object_path,
             cache_key,
             settings.remote_fs_cache,
             FileCache::getCommonUser(),
-            std::move(current_read_buffer_creator),
+            [=, this]() { return read_buffer_creator(/* restricted_seek */true, object_path); },
             settings,
             query_id,
             object.bytes_size,
             /* allow_seeks */false,
             /* use_external_buffer */true,
-            read_until_position ? std::optional<size_t>(read_until_position) : std::nullopt,
+            /* read_until_position */std::nullopt,
             cache_log);
     }
 #endif
 
-    return current_read_buffer_creator();
+    /// Can't wrap CachedOnDiskReadBufferFromFile in CachedInMemoryReadBufferFromFile because the
+    /// former doesn't support seeks.
+    if (with_page_cache && !buf)
+    {
+        auto inner = read_buffer_creator(/* restricted_seek */false, object_path);
+        auto cache_key = FileChunkAddress { .path = cache_path_prefix + object_path };
+        buf = std::make_unique<CachedInMemoryReadBufferFromFile>(
+            cache_key, settings.page_cache, std::move(inner), settings);
+    }
+
+    if (!buf)
+        buf = read_buffer_creator(/* restricted_seek */true, object_path);
+
+    if (read_until_position > start_offset && read_until_position < start_offset + object.bytes_size)
+        buf->setReadUntilPosition(read_until_position - start_offset);
+
+    return buf;
 }
 
 void ReadBufferFromRemoteFSGather::appendUncachedReadInfo()
@@ -124,12 +147,12 @@ void ReadBufferFromRemoteFSGather::initialize()
         return;
 
     /// One clickhouse file can be split into multiple files in remote fs.
-    auto current_buf_offset = file_offset_of_buffer_end;
+    size_t start_offset = 0;
    for (size_t i = 0; i < blobs_to_read.size(); ++i)
     {
         const auto & object = blobs_to_read[i];
 
-        if (object.bytes_size > current_buf_offset)
+        if (start_offset + object.bytes_size > file_offset_of_buffer_end)
         {
             LOG_TEST(log, "Reading from file: {} ({})", object.remote_path, object.local_path);
 
@@ -137,14 +160,14 @@ void ReadBufferFromRemoteFSGather::initialize()
             if (!current_buf || current_buf_idx != i)
             {
                 current_buf_idx = i;
-                current_buf = createImplementationBuffer(object);
+                current_buf = createImplementationBuffer(object, start_offset);
             }
 
-            current_buf->seek(current_buf_offset, SEEK_SET);
+            current_buf->seek(file_offset_of_buffer_end - start_offset, SEEK_SET);
             return;
         }
 
-        current_buf_offset -= object.bytes_size;
+        start_offset += object.bytes_size;
     }
     current_buf_idx = blobs_to_read.size();
     current_buf = nullptr;
@@ -171,14 +194,14 @@ bool ReadBufferFromRemoteFSGather::nextImpl()
 bool ReadBufferFromRemoteFSGather::moveToNextBuffer()
 {
     /// If there is no available buffers - nothing to read.
-    if (current_buf_idx + 1 >= blobs_to_read.size())
+    if (current_buf_idx + 1 >= blobs_to_read.size() || (read_until_position && file_offset_of_buffer_end >= read_until_position))
         return false;
 
     ++current_buf_idx;
 
     const auto & object = blobs_to_read[current_buf_idx];
     LOG_TEST(log, "Reading from next file: {} ({})", object.remote_path, object.local_path);
 
-    current_buf = createImplementationBuffer(object);
+    current_buf = createImplementationBuffer(object, file_offset_of_buffer_end);
 
     return true;
 }
@@ -263,7 +286,7 @@ off_t ReadBufferFromRemoteFSGather::seek(off_t offset, int whence)
 
 ReadBufferFromRemoteFSGather::~ReadBufferFromRemoteFSGather()
 {
-    if (!with_cache)
+    if (!with_file_cache)
         appendUncachedReadInfo();
 }
diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h
index f6b7506a54f..8362b354e23 100644
--- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h
+++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h
@@ -21,11 +21,12 @@ class ReadBufferFromRemoteFSGather final : public ReadBufferFromFileBase
     friend class ReadIndirectBufferFromRemoteFS;
 
 public:
-    using ReadBufferCreator = std::function<std::unique_ptr<ReadBufferFromFileBase>(const std::string & path, size_t read_until_position)>;
+    using ReadBufferCreator = std::function<std::unique_ptr<ReadBufferFromFileBase>(bool restricted_seek, const std::string & path)>;
 
     ReadBufferFromRemoteFSGather(
         ReadBufferCreator && read_buffer_creator_,
         const StoredObjects & blobs_to_read_,
+        const std::string & cache_path_prefix_,
         const ReadSettings & settings_,
         std::shared_ptr<FilesystemCacheLog> cache_log_,
         bool use_external_buffer_);
@@ -53,7 +54,7 @@ public:
     bool isContentCached(size_t offset, size_t size) override;
 
 private:
-    SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object);
+    SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object, size_t start_offset);
 
     bool nextImpl() override;
 
@@ -70,10 +71,12 @@ private:
     const ReadSettings settings;
     const StoredObjects blobs_to_read;
     const ReadBufferCreator read_buffer_creator;
+    const std::string cache_path_prefix;
     const std::shared_ptr<FilesystemCacheLog> cache_log;
     const String query_id;
     const bool use_external_buffer;
-    const bool with_cache;
+    const bool with_file_cache;
+    const bool with_page_cache;
 
     size_t read_until_position = 0;
     size_t file_offset_of_buffer_end = 0;
diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp
index f3caf62ffd5..590fc4c4656 100644
--- a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp
+++ b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp
@@ -152,6 +152,8 @@ IAsynchronousReader::Result ThreadPoolRemoteFSReader::execute(Request request, b
     IAsynchronousReader::Result read_result;
     if (result)
     {
+        chassert(reader.buffer().begin() == request.buf);
+        chassert(reader.buffer().end() <= request.buf + request.size);
         read_result.size = reader.buffer().size();
         read_result.offset = reader.offset();
         ProfileEvents::increment(ProfileEvents::ThreadpoolReaderReadBytes, read_result.size);
diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h
index abc251b2b10..eacce5a54ac 100644
--- a/src/Disks/IO/ThreadPoolRemoteFSReader.h
+++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h
@@ -29,6 +29,9 @@ private:
 class RemoteFSFileDescriptor : public IAsynchronousReader::IFileDescriptor
 {
 public:
+    /// `reader_` implementation must ensure that next() places data at the start of internal_buffer,
+    /// even if there was previously a seek. I.e. seek() shouldn't leave pending data (no short seek
+    /// optimization), and nextImpl() shouldn't assign nextimpl_working_buffer_offset.
     explicit RemoteFSFileDescriptor(
         SeekableReadBuffer & reader_,
         std::shared_ptr<AsyncReadCounters> async_read_counters_)
diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
index 74389aedb64..136f69ab729 100644
--- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
@@ -206,7 +206,7 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
     auto read_buffer_creator =
         [this, settings_ptr, disk_read_settings]
-        (const std::string & path, size_t read_until_position) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool restricted_seek, const std::string & path) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return std::make_unique<ReadBufferFromAzureBlobStorage>(
             client.get(),
             path,
             disk_read_settings,
             settings_ptr->max_single_read_retries,
             settings_ptr->max_single_download_retries,
             /* use_external_buffer */true,
-            /* restricted_seek */true,
-            read_until_position);
+            restricted_seek);
     };
 
     switch (read_settings.remote_fs_method)
@@ -226,16 +225,17 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
     {
         case RemoteFSReadMethod::read:
         {
             return std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "azure:",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */false);
-
         }
         case RemoteFSReadMethod::threadpool:
         {
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "azure:",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */true);
diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
index 2a648f28f14..16183ec20c1 100644
--- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
@@ -527,10 +527,9 @@ std::unique_ptr<ReadBufferFromFileBase> DiskObjectStorage::readFile(
     std::optional<size_t> read_hint,
     std::optional<size_t> file_size) const
 {
-    auto storage_objects = metadata_storage->getStorageObjects(path);
+    const auto storage_objects = metadata_storage->getStorageObjects(path);
 
     const bool file_can_be_empty = !file_size.has_value() || *file_size == 0;
-
     if (storage_objects.empty() && file_can_be_empty)
         return std::make_unique<ReadBufferFromEmptyFile>();
diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
index fa5e227d853..f8545ecfe39 100644
--- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
@@ -60,7 +60,7 @@ std::unique_ptr<ReadBufferFromFileBase> HDFSObjectStorage::readObjects( /// NOLI
     auto disk_read_settings = patchSettings(read_settings);
     auto read_buffer_creator =
         [this, disk_read_settings]
-        (const std::string & path, size_t /* read_until_position */) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool /* restricted_seek */, const std::string & path) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         size_t begin_of_path = path.find('/', path.find("//") + 2);
         auto hdfs_path = path.substr(begin_of_path);
@@ -71,7 +71,7 @@ std::unique_ptr<ReadBufferFromFileBase> HDFSObjectStorage::readObjects( /// NOLI
     };
 
     return std::make_unique<ReadBufferFromRemoteFSGather>(
-        std::move(read_buffer_creator), objects, disk_read_settings, nullptr, /* use_external_buffer */false);
+        std::move(read_buffer_creator), objects, "hdfs:", disk_read_settings, nullptr, /* use_external_buffer */false);
 }
 
 std::unique_ptr<WriteBufferFromFileBase> HDFSObjectStorage::writeObject( /// NOLINT
diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp
index 02700b358e0..7fd4536f266 100644
--- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp
@@ -47,7 +47,7 @@ std::unique_ptr<ReadBufferFromFileBase> LocalObjectStorage::readObjects( /// NOL
     auto modified_settings = patchSettings(read_settings);
     auto global_context = Context::getGlobalContextInstance();
     auto read_buffer_creator =
-        [=] (const std::string & file_path, size_t /* read_until_position */)
+        [=] (bool /* restricted_seek */, const std::string & file_path)
         -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size);
@@ -58,13 +58,13 @@ std::unique_ptr<ReadBufferFromFileBase> LocalObjectStorage::readObjects( /// NOL
         case RemoteFSReadMethod::read:
         {
             return std::make_unique<ReadBufferFromRemoteFSGather>(
-                std::move(read_buffer_creator), objects, modified_settings,
+                std::move(read_buffer_creator), objects, "file:", modified_settings,
                 global_context->getFilesystemCacheLog(), /* use_external_buffer */false);
         }
         case RemoteFSReadMethod::threadpool:
         {
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
-                std::move(read_buffer_creator), objects, modified_settings,
+                std::move(read_buffer_creator), objects, "file:", modified_settings,
                 global_context->getFilesystemCacheLog(), /* use_external_buffer */true);
 
             auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER);
diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
index 5771eb1ebe0..d89c7c93e51 100644
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@@ -171,7 +171,7 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
     auto read_buffer_creator =
         [this, settings_ptr, disk_read_settings]
-        (const std::string & path, size_t read_until_position) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool restricted_seek, const std::string & path) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return std::make_unique<ReadBufferFromS3>(
             client.get(),
@@ -182,8 +182,8 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
             disk_read_settings,
             /* use_external_buffer */true,
             /* offset */0,
-            read_until_position,
-            /* restricted_seek */true);
+            /* read_until_position */0,
+            restricted_seek);
     };
 
     switch (read_settings.remote_fs_method)
@@ -193,16 +193,17 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
     {
         case RemoteFSReadMethod::read:
         {
             return std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "s3:" + uri.bucket + "/",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */false);
-
         }
         case RemoteFSReadMethod::threadpool:
         {
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 objects,
+                "s3:" + uri.bucket + "/",
                 disk_read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */true);
diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp
index 786b23caf48..48de0bf4168 100644
--- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp
@@ -252,14 +252,13 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
 {
     auto read_buffer_creator =
         [this, read_settings]
-        (const std::string & path_, size_t read_until_position) -> std::unique_ptr<ReadBufferFromFileBase>
+        (bool /* restricted_seek */, const std::string & path_) -> std::unique_ptr<ReadBufferFromFileBase>
     {
         return std::make_unique<ReadBufferFromWebServer>(
             fs::path(url) / path_,
             getContext(),
             read_settings,
-            /* use_external_buffer */true,
-            read_until_position);
+            /* use_external_buffer */true);
     };
 
     auto global_context = Context::getGlobalContextInstance();
@@ -271,6 +270,7 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
             return std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 StoredObjects{object},
+                "url:" + url + "/",
                 read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */false);
@@ -280,6 +280,7 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
             auto impl = std::make_unique<ReadBufferFromRemoteFSGather>(
                 std::move(read_buffer_creator),
                 StoredObjects{object},
+                "url:" + url + "/",
                 read_settings,
                 global_context->getFilesystemCacheLog(),
                 /* use_external_buffer */true);
diff --git a/src/IO/AsynchronousReader.h b/src/IO/AsynchronousReader.h
index 279a399caad..f9590b4419f 100644
--- a/src/IO/AsynchronousReader.h
+++ b/src/IO/AsynchronousReader.h
@@ -54,6 +54,9 @@ public:
 
     struct Result
     {
+        /// The read data is at [buf + offset, buf + size), where `buf` is from Request struct.
+        /// (Notice that `offset` is included in `size`.)
+
         /// size
         /// Less than requested amount of data can be returned.
         /// If size is zero - the file has ended.
diff --git a/src/IO/BufferBase.h b/src/IO/BufferBase.h
index 4c0a467b155..1a087dd87fa 100644
--- a/src/IO/BufferBase.h
+++ b/src/IO/BufferBase.h
@@ -60,6 +60,9 @@ public:
     BufferBase(Position ptr, size_t size, size_t offset)
         : pos(ptr + offset), working_buffer(ptr, ptr + size), internal_buffer(ptr, ptr + size) {}
 
+    /// Assign the buffers and pos.
+    /// Be careful when calling this from ReadBuffer::nextImpl() implementations: `offset` is
+    /// effectively ignored because ReadBuffer::next() reassigns `pos`.
     void set(Position ptr, size_t size, size_t offset)
     {
         internal_buffer = Buffer(ptr, ptr + size);
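The set() caveat above matters for the page cache integration: CachedInMemoryReadBufferFromFile (the next file in this patch) points the inner buffer directly at cache memory before each next() call. A minimal sketch of that external-buffer read pattern (editorial illustration; assumes the inner buffer honors the contract documented in ThreadPoolRemoteFSReader.h above):

    // Read the next piece from `in` directly into caller-owned memory at `dest`.
    size_t readIntoExternalBuffer(ReadBuffer & in, char * dest, size_t size)
    {
        in.set(dest, size);        // nextImpl() must now place data at the start of this buffer
        if (!in.next())
            return 0;              // end of file
        size_t n = in.available(); // with the contract held, in.position() == dest
        in.position() += n;        // mark the bytes as consumed
        return n;
    }
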
diff --git a/src/IO/CachedInMemoryReadBufferFromFile.cpp b/src/IO/CachedInMemoryReadBufferFromFile.cpp
new file mode 100644
index 00000000000..384d2229f14
--- /dev/null
+++ b/src/IO/CachedInMemoryReadBufferFromFile.cpp
@@ -0,0 +1,188 @@
+#include "CachedInMemoryReadBufferFromFile.h"
+#include <base/scope_guard.h>
+#include <Common/Exception.h>
+#include <cstring>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNEXPECTED_END_OF_FILE;
+    extern const int CANNOT_SEEK_THROUGH_FILE;
+    extern const int SEEK_POSITION_OUT_OF_BOUND;
+}
+
+CachedInMemoryReadBufferFromFile::CachedInMemoryReadBufferFromFile(
+    FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr<ReadBufferFromFileBase> in_, const ReadSettings & settings_)
+    : ReadBufferFromFileBase(0, nullptr, 0, in_->getFileSize()), cache_key(cache_key_), cache(cache_), settings(settings_), in(std::move(in_))
+    , read_until_position(file_size.value())
+{
+    cache_key.offset = 0;
+}
+
+String CachedInMemoryReadBufferFromFile::getFileName() const
+{
+    return in->getFileName();
+}
+
+off_t CachedInMemoryReadBufferFromFile::seek(off_t off, int whence)
+{
+    if (whence != SEEK_SET)
+        throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed.");
+
+    size_t offset = static_cast<size_t>(off);
+    if (offset > file_size.value())
+        throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", off);
+
+    if (offset >= file_offset_of_buffer_end - working_buffer.size() && offset <= file_offset_of_buffer_end)
+    {
+        pos = working_buffer.end() - (file_offset_of_buffer_end - offset);
+        chassert(getPosition() == off);
+        return off;
+    }
+
+    resetWorkingBuffer();
+
+    file_offset_of_buffer_end = offset;
+    chunk.reset();
+
+    chassert(getPosition() == off);
+    return off;
+}
+
+off_t CachedInMemoryReadBufferFromFile::getPosition()
+{
+    return file_offset_of_buffer_end - available();
+}
+
+size_t CachedInMemoryReadBufferFromFile::getFileOffsetOfBufferEnd() const
+{
+    return file_offset_of_buffer_end;
+}
+
+void CachedInMemoryReadBufferFromFile::setReadUntilPosition(size_t position)
+{
+    read_until_position = position;
+    if (position < static_cast<size_t>(getPosition()))
+    {
+        resetWorkingBuffer();
+        chunk.reset();
+    }
+    else if (position < file_offset_of_buffer_end)
+    {
+        size_t diff = file_offset_of_buffer_end - position;
+        working_buffer.resize(working_buffer.size() - diff);
+        file_offset_of_buffer_end -= diff;
+    }
+}
+
+void CachedInMemoryReadBufferFromFile::setReadUntilEnd()
+{
+    setReadUntilPosition(file_size.value());
+}
+
+bool CachedInMemoryReadBufferFromFile::nextImpl()
+{
+    chassert(read_until_position <= file_size.value());
+    if (file_offset_of_buffer_end >= read_until_position)
+        return false;
+
+    if (chunk.has_value() && file_offset_of_buffer_end >= cache_key.offset + cache->chunkSize())
+    {
+        chassert(file_offset_of_buffer_end == cache_key.offset + cache->chunkSize());
+        chunk.reset();
+    }
+
+    if (!chunk.has_value())
+    {
+        cache_key.offset = file_offset_of_buffer_end / cache->chunkSize() * cache->chunkSize();
+        chunk = cache->getOrSet(cache_key.hash(), settings.read_from_page_cache_if_exists_otherwise_bypass_cache, settings.page_cache_inject_eviction);
+
+        size_t chunk_size = std::min(cache->chunkSize(), file_size.value() - cache_key.offset);
+
+        std::unique_lock download_lock(chunk->getChunk()->state.download_mutex);
+
+        if (!chunk->isPrefixPopulated(chunk_size))
+        {
+            /// A few things could be improved here, which may or may not be worth the added complexity:
+            ///  * If the next file chunk is in cache, use in->setReadUntilPosition() to limit the read to
+            ///    just one chunk. More generally, look ahead in the cache to count how many next chunks
+            ///    need to be downloaded. (Up to some limit? And avoid changing `in`'s until-position if
+            ///    it's already reasonable; otherwise we'd increase it by one chunk every chunk, discarding
+            ///    a half-completed HTTP request every time.)
+            ///  * If only a subset of pages are missing from this chunk, download only them,
+            ///    with some threshold for avoiding short seeks.
+            ///    In particular, if a previous download failed in the middle of the chunk, we could
+            ///    resume from that position instead of from the beginning of the chunk.
+            ///    (It's also possible in principle that a proper subset of chunk's pages was reclaimed
+            ///    by the OS. But, for performance purposes, we should completely ignore that, because
+            ///    (a) PageCache normally uses 2 MiB transparent huge pages and has just one such page
+            ///    per chunk, and (b) even with 4 KiB pages partial chunk eviction is extremely rare.)
+            ///  * If our [position, read_until_position) covers only part of the chunk, we could download
+            ///    just that part. (Which would be bad if someone else needs the rest of the chunk and has
+            ///    to do a whole new HTTP request to get it. Unclear what the policy should be.)
+            ///  * Instead of doing in->next() in a loop until we get the whole chunk, we could return the
+            ///    results as soon as in->next() produces them.
+            ///    (But this would make the download_mutex situation much more complex, similar to the
+            ///    FileSegment::State::PARTIALLY_DOWNLOADED and FileSegment::setRemoteFileReader() stuff.)
+
+            Buffer prev_in_buffer = in->internalBuffer();
+            SCOPE_EXIT({ in->set(prev_in_buffer.begin(), prev_in_buffer.size()); });
+
+            size_t pos = 0;
+            while (pos < chunk_size)
+            {
+                char * piece_start = chunk->getChunk()->data + pos;
+                size_t piece_size = chunk_size - pos;
+                in->set(piece_start, piece_size);
+                if (pos == 0)
+                    in->seek(cache_key.offset, SEEK_SET);
+                else
+                    chassert(!in->available());
+
+                if (in->eof())
+                    throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "File {} ended after {} bytes, but we expected {}",
+                        getFileName(), cache_key.offset + pos, file_size.value());
+
+                chassert(in->position() >= piece_start && in->buffer().end() <= piece_start + piece_size);
+                chassert(in->getPosition() == static_cast<off_t>(cache_key.offset + pos));
+
+                size_t n = in->available();
+                chassert(n);
+                if (in->position() != piece_start)
+                    memmove(piece_start, in->position(), n);
+                in->position() += n;
+                pos += n;
+            }
+
+            chunk->markPrefixPopulated(chunk_size);
+        }
+    }
+
+    nextimpl_working_buffer_offset = file_offset_of_buffer_end - cache_key.offset;
+    working_buffer = Buffer(
+        chunk->getChunk()->data,
+        chunk->getChunk()->data + std::min(chunk->getChunk()->size, read_until_position - cache_key.offset));
+    pos = working_buffer.begin() + nextimpl_working_buffer_offset;
+
+    if (!internal_buffer.empty())
+    {
+        /// We were given an external buffer to read into. Copy the data into it.
+        /// Would be nice to avoid this copy, somehow, maybe by making ReadBufferFromRemoteFSGather
+        /// and AsynchronousBoundedReadBuffer explicitly aware of the page cache.
+        size_t n = std::min(available(), internal_buffer.size());
+        memcpy(internal_buffer.begin(), pos, n);
+        working_buffer = Buffer(internal_buffer.begin(), internal_buffer.begin() + n);
+        pos = working_buffer.begin();
+        nextimpl_working_buffer_offset = 0;
+    }
+
+    file_offset_of_buffer_end += available();
+
+    return true;
+}
+
+}
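A worked example of the chunk arithmetic in nextImpl() above (editorial illustration, using the default 2 MiB chunk size):

    // file_offset_of_buffer_end = 5 MiB + 123, cache->chunkSize() = 2 MiB:
    //   cache_key.offset               = (5 MiB + 123) / 2 MiB * 2 MiB = 4 MiB   // chunk-aligned start
    //   nextimpl_working_buffer_offset = (5 MiB + 123) - 4 MiB = 1 MiB + 123     // position inside the chunk
    // The pinned buffer spans the whole chunk starting at 4 MiB, reading resumes
    // 1 MiB + 123 bytes into it, and the next chunk begins at 6 MiB.
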
diff --git a/src/IO/CachedInMemoryReadBufferFromFile.h b/src/IO/CachedInMemoryReadBufferFromFile.h
new file mode 100644
index 00000000000..300c2e82386
--- /dev/null
+++ b/src/IO/CachedInMemoryReadBufferFromFile.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <Common/PageCache.h>
+#include <IO/ReadBufferFromFileBase.h>
+
+namespace DB
+{
+
+class CachedInMemoryReadBufferFromFile : public ReadBufferFromFileBase
+{
+public:
+    /// `in_` must support using external buffer. I.e. we assign its internal_buffer before each next()
+    /// call and expect the read data to be put into that buffer.
+    /// `in_` should be seekable and should be able to read the whole file from 0 to in_->getFileSize();
+    /// if you set `in_`'s read-until-position bypassing CachedInMemoryReadBufferFromFile then
+    /// CachedInMemoryReadBufferFromFile will break.
+    CachedInMemoryReadBufferFromFile(FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr<ReadBufferFromFileBase> in_, const ReadSettings & settings_);
+
+    String getFileName() const override;
+    off_t seek(off_t off, int whence) override;
+    off_t getPosition() override;
+    size_t getFileOffsetOfBufferEnd() const override;
+    bool supportsRightBoundedReads() const override { return true; }
+    void setReadUntilPosition(size_t position) override;
+    void setReadUntilEnd() override;
+
+private:
+    FileChunkAddress cache_key; // .offset is offset of `chunk` start
+    PageCachePtr cache;
+    ReadSettings settings;
+    std::unique_ptr<ReadBufferFromFileBase> in;
+
+    size_t file_offset_of_buffer_end = 0;
+    size_t read_until_position;
+
+    std::optional<PinnedPageChunk> chunk;
+
+    bool nextImpl() override;
+};
+
+}
diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h
index b45bc8f3dbc..00325734354 100644
--- a/src/IO/ReadBuffer.h
+++ b/src/IO/ReadBuffer.h
@@ -225,11 +225,22 @@ public:
      * - seek() to a position above the until position (even if you setReadUntilPosition() to a
      *   higher value right after the seek!),
      *
-     * Typical implementations discard any current buffers and connections, even if the position is
-     * adjusted only a little.
+     * Implementations are recommended to:
+     *  - Allow the read-until-position to go below current position, e.g.:
+     *      // Read block [300, 400)
+     *      setReadUntilPosition(400);
+     *      seek(300);
+     *      next();
+     *      // Read block [100, 200)
+     *      setReadUntilPosition(200); // oh oh, this is below the current position, but should be allowed
+     *      seek(100); // but now everything's fine again
+     *      next();
+     *      // (Swapping the order of seek and setReadUntilPosition doesn't help: then it breaks if the order of blocks is reversed.)
+     *  - Check if new read-until-position value is equal to the current value and do nothing in this case,
+     *    so that the caller doesn't have to.
      *
-     * Typical usage is to call it right after creating the ReadBuffer, before it started doing any
-     * work.
+     * Typical implementations discard any current buffers and connections when the
+     * read-until-position changes even by a small (nonzero) amount.
      */
     virtual void setReadUntilPosition(size_t /* position */) {}
diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h
index c397689d6ad..f4dc7880be4 100644
--- a/src/IO/ReadSettings.h
+++ b/src/IO/ReadSettings.h
@@ -61,6 +61,7 @@ enum class RemoteFSReadMethod
 };
 
 class MMappedFileCache;
+class PageCache;
 
 struct ReadSettings
 {
@@ -102,6 +103,12 @@ struct ReadSettings
     bool avoid_readthrough_cache_outside_query_context = true;
     size_t filesystem_cache_segments_batch_size = 20;
 
+    bool use_page_cache_for_disks_without_file_cache = false;
+    bool read_from_page_cache_if_exists_otherwise_bypass_cache = false;
+    bool page_cache_inject_eviction = false;
+    std::shared_ptr<PageCache> page_cache;
+
     size_t filesystem_cache_max_download_size = (128UL * 1024 * 1024 * 1024);
     bool skip_download_if_exceeds_query_cache = true;
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index 8304a876fb1..53fd7d9b45f 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <Common/PageCache.h>
 #include
 #include
 #include
@@ -294,6 +295,7 @@ struct ContextSharedPart : boost::noncopyable
     mutable MarkCachePtr index_mark_cache TSA_GUARDED_BY(mutex); /// Cache of marks in compressed files of MergeTree indices.
     mutable MMappedFileCachePtr mmap_cache TSA_GUARDED_BY(mutex); /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads.
     AsynchronousMetrics * asynchronous_metrics TSA_GUARDED_BY(mutex) = nullptr; /// Points to asynchronous metrics
+    mutable PageCachePtr page_cache TSA_GUARDED_BY(mutex); /// Userspace page cache.
     ProcessList process_list; /// Executing queries at the moment.
     SessionTracker session_tracker;
     GlobalOvercommitTracker global_overcommit_tracker;
@@ -1228,7 +1230,7 @@ void Context::setUser(const UUID & user_id_, const std::optional<std::vector<UUID>> & current_roles_)
     /// NOTE: AccessControl::read<User>() and other AccessControl's functions may require some IO work,
-    /// so Context::getLock() must be unlocked while we're doing this.
+    /// so Context::getLocalLock() and Context::getGlobalLock() must be unlocked while we're doing this.
 
     auto & access_control = getAccessControl();
     auto user = access_control.read<User>(user_id_);
@@ -1358,7 +1360,7 @@ void Context::checkAccess(const AccessRightsElements & elements) const { return
 
 std::shared_ptr<const ContextAccess> Context::getAccess() const
 {
-    /// A helper function to collect parameters for calculating access rights, called with Context::getLock() acquired.
+    /// A helper function to collect parameters for calculating access rights, called with Context::getLocalSharedLock() acquired.
     auto get_params = [this]()
     {
         /// If setUserID() was never called then this must be the global context with the full access.
@@ -1385,7 +1387,8 @@ std::shared_ptr<const ContextAccess> Context::getAccess() const
     }
 
     /// Calculate new access rights according to the collected parameters.
-    /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLock() must be unlocked while we're doing this.
+    /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLocalLock()
+    /// and Context::getGlobalLock() must be unlocked while we're doing this.
     auto res = getAccessControl().getContextAccess(*params);
 
     {
@@ -2714,6 +2717,33 @@ void Context::clearUncompressedCache() const
     shared->uncompressed_cache->clear();
 }
 
+void Context::setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages)
+{
+    std::lock_guard lock(shared->mutex);
+
+    if (shared->page_cache)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Page cache has been already created.");
+
+    shared->page_cache = std::make_shared<PageCache>(bytes_per_chunk, bytes_per_mmap, bytes_total, use_madv_free, use_huge_pages);
+}
+
+PageCachePtr Context::getPageCache() const
+{
+    SharedLockGuard lock(shared->mutex);
+    return shared->page_cache;
+}
+
+void Context::dropPageCache() const
+{
+    PageCachePtr cache;
+    {
+        SharedLockGuard lock(shared->mutex);
+        cache = shared->page_cache;
+    }
+    if (cache)
+        cache->dropCache();
+}
+
 void Context::setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio)
 {
     std::lock_guard lock(shared->mutex);
@@ -5130,6 +5160,11 @@ ReadSettings Context::getReadSettings() const
     res.filesystem_cache_max_download_size = settings.filesystem_cache_max_download_size;
     res.skip_download_if_exceeds_query_cache = settings.skip_download_if_exceeds_query_cache;
 
+    res.page_cache = getPageCache();
+    res.use_page_cache_for_disks_without_file_cache = settings.use_page_cache_for_disks_without_file_cache;
+    res.read_from_page_cache_if_exists_otherwise_bypass_cache = settings.read_from_page_cache_if_exists_otherwise_bypass_cache;
+    res.page_cache_inject_eviction = settings.page_cache_inject_eviction;
+
     res.remote_read_min_bytes_for_seek = settings.remote_read_min_bytes_for_seek;
 
     /// Zero read buffer will not make progress.
diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h
index 7bbff9c63bb..ec5a044b28f 100644
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@@ -79,6 +79,7 @@ class RefreshSet;
 class Cluster;
 class Compiler;
 class MarkCache;
+class PageCache;
 class MMappedFileCache;
 class UncompressedCache;
 class ProcessList;
@@ -968,6 +969,10 @@ public:
     std::shared_ptr<UncompressedCache> getUncompressedCache() const;
     void clearUncompressedCache() const;
 
+    void setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages);
+    std::shared_ptr<PageCache> getPageCache() const;
+    void dropPageCache() const;
+
     void setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio);
     void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config);
     std::shared_ptr<MarkCache> getMarkCache() const;
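Presumably the server wires the cache up at startup from the new ServerSettings (an editorial sketch; the actual Server.cpp hunk is outside this excerpt):

    // Create the global userspace page cache unless page_cache_size is 0 (disabled).
    if (server_settings.page_cache_size != 0)
        global_context->setPageCache(
            server_settings.page_cache_chunk_size,
            server_settings.page_cache_mmap_size,
            server_settings.page_cache_size,
            server_settings.page_cache_use_madv_free,
            server_settings.page_cache_use_transparent_huge_pages);
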
diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp
index a078d99facf..4bb47a8c9e3 100644
--- a/src/Interpreters/InterpreterSystemQuery.cpp
+++ b/src/Interpreters/InterpreterSystemQuery.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <Common/PageCache.h>
 #include
 #include
 #include
@@ -460,6 +461,13 @@ BlockIO InterpreterSystemQuery::execute()
         {
             throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Not implemented");
         }
+        case Type::DROP_PAGE_CACHE:
+        {
+            getContext()->checkAccess(AccessType::SYSTEM_DROP_PAGE_CACHE);
+
+            getContext()->dropPageCache();
+            break;
+        }
         case Type::DROP_SCHEMA_CACHE:
         {
             getContext()->checkAccess(AccessType::SYSTEM_DROP_SCHEMA_CACHE);
@@ -1201,6 +1209,7 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
         case Type::DROP_INDEX_UNCOMPRESSED_CACHE:
         case Type::DROP_FILESYSTEM_CACHE:
         case Type::SYNC_FILESYSTEM_CACHE:
+        case Type::DROP_PAGE_CACHE:
         case Type::DROP_SCHEMA_CACHE:
         case Type::DROP_FORMAT_SCHEMA_CACHE:
         case Type::DROP_S3_CLIENT_CACHE:
diff --git a/src/Interpreters/ServerAsynchronousMetrics.cpp b/src/Interpreters/ServerAsynchronousMetrics.cpp
index bdf314f35b9..fe7ccd64ffe 100644
--- a/src/Interpreters/ServerAsynchronousMetrics.cpp
+++ b/src/Interpreters/ServerAsynchronousMetrics.cpp
@@ -9,6 +9,8 @@
 #include
 #include
 
+#include <Common/PageCache.h>
+
 #include
 #include
 
@@ -77,6 +79,16 @@ void ServerAsynchronousMetrics::updateImpl(TimePoint update_time, TimePoint curr
         new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" };
     }
 
+    if (auto page_cache = getContext()->getPageCache())
+    {
+        auto rss = page_cache->getResidentSetSize();
+        new_values["PageCacheBytes"] = { rss.page_cache_rss, "Userspace page cache memory usage in bytes" };
+        new_values["PageCachePinnedBytes"] = { page_cache->getPinnedSize(), "Userspace page cache memory that's currently in use and can't be evicted" };
+
+        if (rss.unreclaimable_rss.has_value())
+            new_values["UnreclaimableRSS"] = { *rss.unreclaimable_rss, "The amount of physical memory used by the server process, in bytes, excluding memory reclaimable by the OS (MADV_FREE)" };
+    }
+
     if (auto uncompressed_cache = getContext()->getUncompressedCache())
     {
         new_values["UncompressedCacheBytes"] = { uncompressed_cache->sizeInBytes(),
diff --git a/src/Interpreters/tests/gtest_page_cache.cpp b/src/Interpreters/tests/gtest_page_cache.cpp
new file mode 100644
index 00000000000..1e2688c0ca2
--- /dev/null
+++ b/src/Interpreters/tests/gtest_page_cache.cpp
@@ -0,0 +1,267 @@
+#include <Common/PageCache.h>
+#include <Common/randomSeed.h>
+#include <gtest/gtest.h>
+
+#ifdef OS_LINUX
+#include <sys/sysinfo.h>
+#endif
+
+using namespace DB;
+
+namespace ProfileEvents
+{
+    extern const Event PageCacheChunkMisses;
+    extern const Event PageCacheChunkShared;
+    extern const Event PageCacheChunkDataHits;
+    extern const Event PageCacheChunkDataPartialHits;
+    extern const Event PageCacheChunkDataMisses;
+}
+
+#define CHECK(x) \
+    do { \
+        if (!(x)) \
+        { \
+            std::cerr << "check on line " << __LINE__ << " failed: " << #x << std::endl; \
+            std::abort(); \
+        } \
+    } while (false)
+
+size_t estimateRAMSize()
+{
+#ifdef OS_LINUX
+    struct sysinfo info;
+    int r = sysinfo(&info);
+    CHECK(r == 0);
+    return static_cast<size_t>(info.totalram * info.mem_unit);
+#else
+    return 128ul << 30;
+#endif
+}
+
+/// Do random reads and writes in PageCache from multiple threads, check that the data read matches the data written.
+TEST(PageCache, DISABLED_Stress)
+{
+    /// There doesn't seem to be a reasonable way to simulate memory pressure or force the eviction of MADV_FREE-d pages.
+    /// So we actually map more virtual memory than we have RAM and fill it all up a few times.
+    /// This takes an eternity (a few minutes), but idk how else to hit MADV_FREE eviction.
+    /// Expect ~1 GB/s, bottlenecked by page faults.
+ size_t ram_size = estimateRAMSize(); + PageCache cache(2 << 20, 1 << 30, ram_size + ram_size / 10, /* use_madv_free */ true, /* use_huge_pages */ true); + + CHECK(cache.getResidentSetSize().page_cache_rss); + + const size_t num_keys = static_cast(cache.maxChunks() * 1.5); + const size_t pages_per_chunk = cache.chunkSize() / cache.pageSize(); + const size_t items_per_page = cache.pageSize() / 8; + + const size_t passes = 2; + const size_t step = 20; + const size_t num_threads = 20; + const size_t chunks_touched = num_keys * passes * num_threads / step; + std::atomic progress {0}; + std::atomic threads_finished {0}; + + std::atomic total_racing_writes {0}; + + auto thread_func = [&] + { + pcg64 rng(randomSeed()); + std::vector pinned; + + /// Stats. + size_t racing_writes = 0; + + for (size_t i = 0; i < num_keys * passes; i += step) + { + progress += 1; + + /// Touch the chunks sequentially + noise (to increase interference across threads), or at random 10% of the time. + size_t key_idx; + if (rng() % 10 == 0) + key_idx = std::uniform_int_distribution(0, num_keys - 1)(rng); + else + key_idx = (i + std::uniform_int_distribution(0, num_keys / 1000)(rng)) % num_keys; + + /// For some keys, always use detached_if_missing = true and check that cache always misses. + bool key_detached_if_missing = key_idx % 100 == 42; + bool detached_if_missing = key_detached_if_missing || i % 101 == 42; + + PageCacheKey key = key_idx * 0xcafebabeb0bad00dul; // a simple reversible hash (the constant can be any odd number) + + PinnedPageChunk chunk = cache.getOrSet(key, detached_if_missing, /* inject_eviction */ false); + + if (key_detached_if_missing) + CHECK(!chunk.getChunk()->pages_populated.any()); + + for (size_t page_idx = 0; page_idx < pages_per_chunk; ++page_idx) + { + bool populated = chunk.getChunk()->pages_populated.get(page_idx); + /// Generate page contents deterministically from key and page index. + size_t start = key_idx * page_idx; + if (start % 37 == 13) + { + /// Leave ~1/37 of the pages unpopulated. + CHECK(!populated); + } + else + { + /// We may write/read the same memory from multiple threads in parallel here. + std::atomic * items = reinterpret_cast *>(chunk.getChunk()->data + cache.pageSize() * page_idx); + if (populated) + { + for (size_t j = 0; j < items_per_page; ++j) + CHECK(items[j].load(std::memory_order_relaxed) == start + j); + } + else + { + for (size_t j = 0; j < items_per_page; ++j) + items[j].store(start + j, std::memory_order_relaxed); + if (!chunk.markPagePopulated(page_idx)) + racing_writes += 1; + } + } + } + + pinned.push_back(std::move(chunk)); + CHECK(cache.getPinnedSize() >= cache.chunkSize()); + /// Unpin 2 chunks on average. + while (rng() % 3 != 0 && !pinned.empty()) + { + size_t idx = rng() % pinned.size(); + if (idx != pinned.size() - 1) + pinned[idx] = std::move(pinned.back()); + pinned.pop_back(); + } + } + + total_racing_writes += racing_writes; + threads_finished += 1; + }; + + std::cout << fmt::format("doing {:.1f} passes over {:.1f} GiB of virtual memory\nthis will take a few minutes, progress printed every 10 seconds", + chunks_touched * 1. / cache.maxChunks(), cache.maxChunks() * cache.chunkSize() * 1. / (1ul << 30)) << std::endl; + + auto start_time = std::chrono::steady_clock::now(); + + std::vector threads; + for (size_t i = 0; i < num_threads; ++i) + threads.emplace_back(thread_func); + + for (size_t poll = 0;; ++poll) + { + if (threads_finished == num_threads) + break; + if (poll % 100 == 0) + std::cout << fmt::format("{:.3f}%", progress.load() * 100. 
/ num_keys / passes / num_threads * step) << std::endl; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + for (std::thread & t : threads) + t.join(); + + auto end_time = std::chrono::steady_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(end_time - start_time).count(); + double touched_gib = chunks_touched * cache.chunkSize() * 1. / (1ul << 30); + std::cout << fmt::format("touched {:.1f} GiB in {:.1f} seconds, that's {:.3f} GiB/s", + touched_gib, elapsed_seconds, touched_gib / elapsed_seconds) << std::endl; + + auto & counters = CurrentThread::getProfileEvents(); + + std::cout << "stats:" + << "\nchunk misses: " << counters[ProfileEvents::PageCacheChunkMisses].load() + << "\nchunk shared: " << counters[ProfileEvents::PageCacheChunkShared].load() + << "\nchunk data misses: " << counters[ProfileEvents::PageCacheChunkDataMisses].load() + << "\nchunk data partial hits: " << counters[ProfileEvents::PageCacheChunkDataPartialHits].load() + << "\nchunk data hits: " << counters[ProfileEvents::PageCacheChunkDataHits].load() + << "\nracing page writes: " << total_racing_writes << std::endl; + + /// Check that we at least hit all the cases. + CHECK(counters[ProfileEvents::PageCacheChunkMisses].load() > 0); + CHECK(counters[ProfileEvents::PageCacheChunkShared].load() > 0); + CHECK(counters[ProfileEvents::PageCacheChunkDataMisses].load() > 0); + /// Partial hits are rare enough that sometimes this is zero, so don't check it. + /// That's good news because we don't need to implement downloading parts of a chunk. + /// CHECK(counters[ProfileEvents::PageCacheChunkDataPartialHits].load() > 0); + CHECK(counters[ProfileEvents::PageCacheChunkDataHits].load() > 0); + CHECK(total_racing_writes > 0); + CHECK(cache.getPinnedSize() == 0); + + size_t rss = cache.getResidentSetSize().page_cache_rss; + std::cout << "RSS: " << rss * 1. / (1ul << 30) << " GiB" << std::endl; + /// This can be flaky if the system has < 10% free memory. If this turns out to be a problem, feel free to remove or reduce. + CHECK(rss > ram_size / 10); + + cache.dropCache(); + +#ifdef OS_LINUX + /// MADV_DONTNEED is not synchronous, and we're freeing lots of pages. Let's give Linux a lot of time. + std::this_thread::sleep_for(std::chrono::seconds(10)); + size_t new_rss = cache.getResidentSetSize().page_cache_rss; + std::cout << "RSS after dropping cache: " << new_rss * 1. / (1ul << 30) << " GiB" << std::endl; + CHECK(new_rss < rss / 2); +#endif +} + +/// Benchmark that measures the PageCache overhead for cache hits. Doesn't touch the actual data, so +/// memory bandwidth mostly doesn't factor into this. +/// This measures the overhead of things like madvise(MADV_FREE) and probing the pages (restoreChunkFromLimbo()). +/// Disabled in CI, run manually with --gtest_also_run_disabled_tests --gtest_filter=PageCache.DISABLED_HitsBench +TEST(PageCache, DISABLED_HitsBench) +{ + /// Do a few runs, with and without MADV_FREE. + for (size_t num_threads = 1; num_threads <= 16; num_threads *= 2) + { + for (size_t run = 0; run < 8; ++ run) + { + bool use_madv_free = run % 2 == 1; + bool use_huge_pages = run % 4 / 2 == 1; + + PageCache cache(2 << 20, 1ul << 30, 20ul << 30, use_madv_free, use_huge_pages); + size_t passes = 3; + std::atomic total_misses {0}; + + /// Prepopulate all chunks. 
+ for (size_t i = 0; i < cache.maxChunks(); ++i) + { + PageCacheKey key = i * 0xcafebabeb0bad00dul; + PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false); + memset(chunk.getChunk()->data, 42, chunk.getChunk()->size); + chunk.markPrefixPopulated(cache.chunkSize()); + } + + auto thread_func = [&] + { + pcg64 rng(randomSeed()); + size_t misses = 0; + for (size_t i = 0; i < cache.maxChunks() * passes; ++i) + { + PageCacheKey key = rng() % cache.maxChunks() * 0xcafebabeb0bad00dul; + PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false); + if (!chunk.isPrefixPopulated(cache.chunkSize())) + misses += 1; + } + total_misses += misses; + }; + + auto start_time = std::chrono::steady_clock::now(); + + std::vector threads; + for (size_t i = 0; i < num_threads; ++i) + threads.emplace_back(thread_func); + + for (std::thread & t : threads) + t.join(); + + auto end_time = std::chrono::steady_clock::now(); + double elapsed_seconds = std::chrono::duration_cast>(end_time - start_time).count(); + double fetched_gib = cache.chunkSize() * cache.maxChunks() * passes * 1. / (1ul << 30); + std::cout << fmt::format( + "threads {}, run {}, use_madv_free = {}, use_huge_pages = {}\nrequested {:.1f} GiB in {:.1f} seconds\n" + "that's {:.1f} GiB/s, or overhead of {:.3}us/{:.1}MiB\n", + num_threads, run, use_madv_free, use_huge_pages, fetched_gib, elapsed_seconds, fetched_gib / elapsed_seconds, + elapsed_seconds * 1e6 / cache.maxChunks() / passes, cache.chunkSize() * 1. / (1 << 20)) << std::endl; + + if (total_misses != 0) + std::cout << "!got " << total_misses.load() << " misses! perhaps your system doesn't have enough free memory, consider decreasing cache size in the benchmark code" << std::endl; + } + } +} diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 9aa90f499d0..48be7f6b84f 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -31,6 +31,7 @@ public: DROP_COMPILED_EXPRESSION_CACHE, DROP_FILESYSTEM_CACHE, DROP_DISK_METADATA_CACHE, + DROP_PAGE_CACHE, DROP_SCHEMA_CACHE, DROP_FORMAT_SCHEMA_CACHE, DROP_S3_CLIENT_CACHE, diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 39ad28d3dae..a9bdceacef0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1637,10 +1637,6 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const size_t file_size = getDataPartStorage().getFileSize(TXN_VERSION_METADATA_FILE_NAME); auto buf = getDataPartStorage().readFile(TXN_VERSION_METADATA_FILE_NAME, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); - /// FIXME https://github.com/ClickHouse/ClickHouse/issues/48465 - if (dynamic_cast(buf.get())) - return true; - readStringUntilEOF(content, *buf); ReadBufferFromString str_buf{content}; VersionMetadata file; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index da90dbb4076..53a18d3cc5b 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -727,7 +727,7 @@ std::unique_ptr StorageS3Source::createAsyncS3ReadBuffer( auto context = getContext(); auto read_buffer_creator = [this, read_settings, object_size] - (const std::string & path, size_t read_until_position) -> std::unique_ptr + (bool restricted_seek, const std::string & path) -> std::unique_ptr { return std::make_unique( client, @@ -738,21 +738,25 @@ std::unique_ptr
StorageS3Source::createAsyncS3ReadBuffer( read_settings, /* use_external_buffer */true, /* offset */0, - read_until_position, - /* restricted_seek */true, + /* read_until_position */0, + restricted_seek, object_size); }; + auto modified_settings{read_settings}; + /// User's S3 object may change, don't cache it. + modified_settings.use_page_cache_for_disks_without_file_cache = false; + + /// FIXME: Changing this setting to default value breaks something around parquet reading + modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; + auto s3_impl = std::make_unique( std::move(read_buffer_creator), StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, + "", read_settings, /* cache_log */nullptr, /* use_external_buffer */true); - auto modified_settings{read_settings}; - /// FIXME: Changing this setting to default value breaks something around parquet reading - modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); auto async_reader = std::make_unique( std::move(s3_impl), pool_reader, modified_settings, diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f438c6f4f31..d44c80bc410 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -699,6 +699,8 @@ class SettingsRandomizer: get_localzone(), ] ), + "use_page_cache_for_disks_without_file_cache": lambda: random.random() < 0.7, + "page_cache_inject_eviction": lambda: random.random() < 0.5, } @staticmethod diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index e1f5213790d..88f18c52536 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -112,6 +112,7 @@ SYSTEM DROP QUERY CACHE ['SYSTEM DROP QUERY','DROP QUERY CACHE','DROP QUERY'] GL SYSTEM DROP COMPILED EXPRESSION CACHE ['SYSTEM DROP COMPILED EXPRESSION','DROP COMPILED EXPRESSION CACHE','DROP COMPILED EXPRESSIONS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FILESYSTEM CACHE ['SYSTEM DROP FILESYSTEM CACHE','DROP FILESYSTEM CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM SYNC FILESYSTEM CACHE ['SYSTEM REPAIR FILESYSTEM CACHE','REPAIR FILESYSTEM CACHE','SYNC FILESYSTEM CACHE'] GLOBAL SYSTEM +SYSTEM DROP PAGE CACHE ['SYSTEM DROP PAGE CACHE','DROP PAGE CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP SCHEMA CACHE ['SYSTEM DROP SCHEMA CACHE','DROP SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FORMAT SCHEMA CACHE ['SYSTEM DROP FORMAT SCHEMA CACHE','DROP FORMAT SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP S3 CLIENT CACHE ['SYSTEM DROP S3 CLIENT','DROP S3 CLIENT CACHE'] GLOBAL SYSTEM DROP CACHE diff --git a/tests/queries/0_stateless/02867_page_cache.reference b/tests/queries/0_stateless/02867_page_cache.reference new file mode 100644 index 00000000000..5502059508a --- /dev/null +++ b/tests/queries/0_stateless/02867_page_cache.reference @@ -0,0 +1,23 @@ +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkMisses 1 +ReadBufferFromS3Bytes 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkDataHits 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkMisses 1 +ReadBufferFromS3Bytes 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 
+PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkMisses 1 +ReadBufferFromS3Bytes 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkDataHits 1 diff --git a/tests/queries/0_stateless/02867_page_cache.sql b/tests/queries/0_stateless/02867_page_cache.sql new file mode 100644 index 00000000000..8765b30ebc3 --- /dev/null +++ b/tests/queries/0_stateless/02867_page_cache.sql @@ -0,0 +1,105 @@ +-- Tags: no-fasttest, no-parallel +-- no-fasttest because we need an S3 storage policy +-- no-parallel because we look at server-wide counters about page cache usage + +set use_page_cache_for_disks_without_file_cache = 1; +set page_cache_inject_eviction = 0; +set enable_filesystem_cache = 0; +set use_uncompressed_cache = 0; + +create table events_snapshot engine Memory as select * from system.events; +create view events_diff as + -- round all stats to 70 MiB to leave a lot of leeway for overhead + with if(event like '%Bytes%', 70*1024*1024, 35) as granularity, + -- cache hits counter can vary a lot depending on other settings: + -- e.g. if merge_tree_min_bytes_for_concurrent_read is small, multiple threads will read each chunk + -- so we just check that the value is not too low + if(event in ( + 'PageCacheBytesUnpinnedRoundedToPages', 'PageCacheBytesUnpinnedRoundedToHugePages', + 'PageCacheChunkDataHits'), 1, 1000) as clamp + select event, min2(intDiv(new.value - old.value, granularity), clamp) as diff + from system.events new + left outer join events_snapshot old + on old.event = new.event + where diff != 0 and + event in ( + 'ReadBufferFromS3Bytes', 'PageCacheChunkMisses', 'PageCacheChunkDataMisses', + 'PageCacheChunkDataHits', 'PageCacheChunkDataPartialHits', + 'PageCacheBytesUnpinnedRoundedToPages', 'PageCacheBytesUnpinnedRoundedToHugePages') + order by event; + +drop table if exists page_cache_03055; +create table page_cache_03055 (k Int64 CODEC(NONE)) engine MergeTree order by k settings storage_policy = 's3_cache'; + +-- Write an 80 MiB file (40 x 2 MiB chunks), and a few small files. +system stop merges page_cache_03055; +insert into page_cache_03055 select * from numbers(10485760) settings max_block_size=100000000, preferred_block_size_bytes=1000000000; + +select * from events_diff; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +system start merges page_cache_03055; +optimize table page_cache_03055 final; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Cold read, should miss cache. (Populating cache on write is not implemented yet.) + +select sum(k) from page_cache_03055; + +select * from events_diff where event not in ('PageCacheChunkDataHits'); +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Repeat read, should hit cache. + +select sum(k) from page_cache_03055; + +select * from events_diff; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Drop cache and read again, should miss. Also don't write to cache. + +system drop page cache; + +select sum(k) from page_cache_03055 settings read_from_page_cache_if_exists_otherwise_bypass_cache = 1; + +-- Data could be read multiple times because we're not writing to cache. 
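+-- Illustrative, not an exact assertion: with the 70 MiB granularity above, a single ~80 MiB read rounds to a
+-- diff of 1, but bypass mode may refetch chunks, so the next check only requires the miss counters to be >= 1.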
+select event, if(event in ('PageCacheChunkMisses', 'ReadBufferFromS3Bytes'), diff >= 1, diff) from events_diff where event not in ('PageCacheChunkDataHits'); +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Repeat read, should still miss, but populate cache. + +select sum(k) from page_cache_03055; + +select * from events_diff where event not in ('PageCacheChunkDataHits'); +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Read again, hit the cache. + +select sum(k) from page_cache_03055 settings read_from_page_cache_if_exists_otherwise_bypass_cache = 1; + +select * from events_diff; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + + +-- Known limitation: cache is not invalidated if a table is dropped and created again at the same path. +-- set allow_deprecated_database_ordinary=1; +-- create database test_03055 engine = Ordinary; +-- create table test_03055.t (k Int64) engine MergeTree order by k settings storage_policy = 's3_cache'; +-- insert into test_03055.t values (1); +-- select * from test_03055.t; +-- drop table test_03055.t; +-- create table test_03055.t (k Int64) engine MergeTree order by k settings storage_policy = 's3_cache'; +-- insert into test_03055.t values (2); +-- select * from test_03055.t; + + +drop table events_snapshot; +drop table page_cache_03055; +drop view events_diff; From 227e3b58067ebaabc281673601408fedd135a5a2 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 29 Feb 2024 12:26:33 +0000 Subject: [PATCH 21/66] Conflict --- src/Parsers/ASTSystemQuery.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp index 0713737af95..63311a70e42 100644 --- a/src/Parsers/ASTSystemQuery.cpp +++ b/src/Parsers/ASTSystemQuery.cpp @@ -415,6 +415,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState & s case Type::STOP_THREAD_FUZZER: case Type::START_VIEWS: case Type::STOP_VIEWS: + case Type::DROP_PAGE_CACHE: break; case Type::UNKNOWN: case Type::END: From 1b8ae25153a06a630e8f3553d30494a497b9c449 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 29 Feb 2024 18:16:05 +0100 Subject: [PATCH 22/66] Use cancel instead of finish in case of exception --- src/Processors/Executors/PipelineExecutor.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 8477e011763..5b5880759e6 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -391,7 +391,9 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control) SCOPE_EXIT_SAFE( if (!finished_flag) { - finish(); + /// If finished_flag is not set, there was an exception. + /// Cancel execution in this case. + cancel(); if (pool) pool->wait(); } @@ -399,18 +401,7 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control) if (num_threads > 1) { - try - { - spawnThreads(); // start at least one thread - } - catch (...) - { - /// spawnThreads can throw an exception, for example CANNOT_SCHEDULE_TASK. - /// We should cancel execution properly before rethrow. 
- cancel(); - throw; - } - + spawnThreads(); // start at least one thread tasks.processAsyncTasks(); pool->wait(); } From 0b10612c863bf5b62bcf90028daa57275a966b6a Mon Sep 17 00:00:00 2001 From: yariks5s Date: Fri, 1 Mar 2024 17:42:57 +0000 Subject: [PATCH 23/66] fix --- src/Functions/FunctionBinaryArithmetic.h | 23 ++++++++++--------- src/Functions/IsOperation.h | 6 ++--- ...02_int_div_decimal_with_date_bug.reference | 0 .../03002_int_div_decimal_with_date_bug.sql | 5 ++++ 4 files changed, 20 insertions(+), 14 deletions(-) create mode 100644 tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.reference create mode 100644 tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.sql diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index d253095ca01..9ad74f6332f 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -170,7 +170,8 @@ public: /// DateTime, but if both operands are Dates, their type must be the same (e.g. Date - DateTime is invalid). using ResultDataType = Switch< /// Result must be Integer - Case::div_int || IsOperation::div_int_or_zero, DataTypeFromFieldType>, + Case::int_div || IsOperation::int_div_or_zero, + std::conditional_t && IsDataTypeNumber, DataTypeFromFieldType, InvalidType>>, /// Decimal cases Case || IsDataTypeDecimal, DecimalResultDataType>, Case< @@ -672,8 +673,8 @@ private: IsOperation::minus; static constexpr bool is_multiply = IsOperation::multiply; static constexpr bool is_float_division = IsOperation::div_floating; - static constexpr bool is_int_division = IsOperation::div_int || - IsOperation::div_int_or_zero; + static constexpr bool is_int_division = IsOperation::int_div || + IsOperation::int_div_or_zero; static constexpr bool is_division = is_float_division || is_int_division; static constexpr bool is_compare = IsOperation::least || IsOperation::greatest; @@ -781,8 +782,8 @@ class FunctionBinaryArithmetic : public IFunction static constexpr bool is_division = IsOperation::division; static constexpr bool is_bit_hamming_distance = IsOperation::bit_hamming_distance; static constexpr bool is_modulo = IsOperation::modulo; - static constexpr bool is_div_int = IsOperation::div_int; - static constexpr bool is_div_int_or_zero = IsOperation::div_int_or_zero; + static constexpr bool is_int_div = IsOperation::int_div; + static constexpr bool is_int_div_or_zero = IsOperation::int_div_or_zero; ContextPtr context; bool check_decimal_overflow = true; @@ -1007,11 +1008,11 @@ class FunctionBinaryArithmetic : public IFunction { function_name = "tupleModuloByNumber"; } - else if constexpr (is_div_int) + else if constexpr (is_int_div) { function_name = "tupleIntDivByNumber"; } - else if constexpr (is_div_int_or_zero) + else if constexpr (is_int_div_or_zero) { function_name = "tupleIntDivOrZeroByNumber"; } @@ -1466,7 +1467,7 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & arguments) const override { - return ((IsOperation::div_int || IsOperation::modulo || IsOperation::positive_modulo) && !arguments[1].is_const) + return ((IsOperation::int_div || IsOperation::modulo || IsOperation::positive_modulo) && !arguments[1].is_const) || (IsOperation::div_floating && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type))); } @@ -1690,7 +1691,7 @@ public: if constexpr (!std::is_same_v) { - if constexpr (is_div_int || is_div_int_or_zero) + if constexpr (is_int_div || is_int_div_or_zero) type_res 
= std::make_shared(); else if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) { @@ -2086,7 +2087,7 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A right_nullmap); } /// Here we check if we have `intDiv` or `intDivOrZero` and at least one of the arguments is decimal, because in this case originally we had result as decimal, so we need to convert result into integer after calculations - else if constexpr (!decimal_with_float && (is_div_int || is_div_int_or_zero) && (IsDataTypeDecimal || IsDataTypeDecimal)) + else if constexpr (!decimal_with_float && (is_int_div || is_int_div_or_zero) && (IsDataTypeDecimal || IsDataTypeDecimal)) { if constexpr (!std::is_same_v) @@ -2624,7 +2625,7 @@ public: /// Check the case when operation is divide, intDiv or modulo and denominator is Nullable(Something). /// For divide operation we should check only Nullable(Decimal), because only this case can throw division by zero error. bool division_by_nullable = !arguments[0].type->onlyNull() && !arguments[1].type->onlyNull() && arguments[1].type->isNullable() - && (IsOperation::div_int || IsOperation::modulo || IsOperation::positive_modulo + && (IsOperation::int_div || IsOperation::modulo || IsOperation::positive_modulo || (IsOperation::div_floating && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type)))); diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index b2c7a27d375..a74df8f4dd9 100644 --- a/src/Functions/IsOperation.h +++ b/src/Functions/IsOperation.h @@ -51,8 +51,8 @@ struct IsOperation static constexpr bool minus = IsSameOperation::value; static constexpr bool multiply = IsSameOperation::value; static constexpr bool div_floating = IsSameOperation::value; - static constexpr bool div_int = IsSameOperation::value; - static constexpr bool div_int_or_zero = IsSameOperation::value; + static constexpr bool int_div = IsSameOperation::value; + static constexpr bool int_div_or_zero = IsSameOperation::value; static constexpr bool modulo = IsSameOperation::value; static constexpr bool positive_modulo = IsSameOperation::value; static constexpr bool least = IsSameOperation::value; @@ -60,7 +60,7 @@ struct IsOperation static constexpr bool bit_hamming_distance = IsSameOperation::value; - static constexpr bool division = div_floating || div_int || div_int_or_zero || modulo; + static constexpr bool division = div_floating || int_div || int_div_or_zero || modulo; // NOTE: allow_decimal should not fully contain `division` because of divInt static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest; }; diff --git a/tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.reference b/tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.sql b/tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.sql new file mode 100644 index 00000000000..1668821200c --- /dev/null +++ b/tests/queries/0_stateless/03002_int_div_decimal_with_date_bug.sql @@ -0,0 +1,5 @@ +SELECT intDiv(CAST('1.0', 'Decimal256(3)'), today()); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT intDiv(CAST('1.0', 'Decimal256(3)'), toDate('2023-01-02')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT intDiv(CAST('1.0', 'Decimal256(2)'), toDate32('2023-01-02 12:12:12')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT intDiv(CAST('1.0', 'Decimal256(2)'), 
toDateTime('2023-01-02 12:12:12')); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT intDiv(CAST('1.0', 'Decimal256(2)'), toDateTime64('2023-01-02 12:12:12.002', 3)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } From 57482de0f0858f5a03e60a7310227106487ce438 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Sat, 2 Mar 2024 01:16:47 +0100 Subject: [PATCH 24/66] Update FunctionBinaryArithmetic.h --- src/Functions/FunctionBinaryArithmetic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 9ad74f6332f..79e5ee442c2 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -171,7 +171,7 @@ public: using ResultDataType = Switch< /// Result must be Integer Case::int_div || IsOperation::int_div_or_zero, - std::conditional_t && IsDataTypeNumber, DataTypeFromFieldType, InvalidType>>, + std::conditional_t && IsDataTypeDecimalOrNumber, DataTypeFromFieldType, InvalidType>>, /// Decimal cases Case || IsDataTypeDecimal, DecimalResultDataType>, Case< From adeccecba93fcdfb81d0761c0f15b39d9bf2b471 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 05:40:19 +0100 Subject: [PATCH 25/66] Fix build --- src/Common/SystemLogBase.cpp | 6 ++++++ src/Common/SystemLogBase.h | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index aef4e19a70c..4dee6d905d9 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -260,4 +260,10 @@ void SystemLogBase::add(LogElement element) template void SystemLogBase::notifyFlush(bool force) { queue->notifyFlush(force); } +#define INSTANTIATE_SYSTEM_LOG_BASE(ELEMENT) template class SystemLogBase; +SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_BASE) + +#define INSTANTIATE_SYSTEM_LOG_QUEUE(ELEMENT) template class SystemLogQueue; +SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG_QUEUE) + } diff --git a/src/Common/SystemLogBase.h b/src/Common/SystemLogBase.h index b4d6f2e98bb..95906c63349 100644 --- a/src/Common/SystemLogBase.h +++ b/src/Common/SystemLogBase.h @@ -29,7 +29,6 @@ M(TextLogElement) \ M(S3QueueLogElement) \ M(FilesystemCacheLogElement) \ - M(DistributedCacheLogElement) \ M(FilesystemReadPrefetchesLogElement) \ M(AsynchronousInsertLogElement) \ M(BackupLogElement) \ From df498107c91cabb20e0e66db1b8b8fa122b0a842 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 17:29:09 +0100 Subject: [PATCH 26/66] Fix test --- tests/integration/test_grant_and_revoke/test.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/integration/test_grant_and_revoke/test.py b/tests/integration/test_grant_and_revoke/test.py index 75a59ceac39..46d8d254a0a 100644 --- a/tests/integration/test_grant_and_revoke/test.py +++ b/tests/integration/test_grant_and_revoke/test.py @@ -186,10 +186,7 @@ def test_grant_all_on_table(): instance.query("GRANT ALL ON test.table TO B", user="A") assert ( instance.query("SHOW GRANTS FOR B") - == "GRANT SHOW TABLES, SHOW COLUMNS, SHOW DICTIONARIES, SELECT, INSERT, ALTER TABLE, ALTER VIEW, CREATE TABLE, CREATE VIEW, CREATE DICTIONARY, " - "DROP TABLE, DROP VIEW, DROP DICTIONARY, UNDROP TABLE, TRUNCATE, OPTIMIZE, BACKUP, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, SHOW ROW POLICIES, " - "SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, SYSTEM MOVES, SYSTEM PULLING REPLICATION LOG, SYSTEM CLEANUP, SYSTEM VIEWS, 
SYSTEM SENDS, SYSTEM REPLICATION QUEUES, SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, " - "SYSTEM RESTART REPLICA, SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM FLUSH DISTRIBUTED, dictGet ON test.`table` TO B\n" + == "GRANT SHOW TABLES, SHOW COLUMNS, SHOW DICTIONARIES, SELECT, INSERT, ALTER TABLE, ALTER VIEW, CREATE TABLE, CREATE VIEW, CREATE DICTIONARY, DROP TABLE, DROP VIEW, DROP DICTIONARY, UNDROP TABLE, TRUNCATE, OPTIMIZE, BACKUP, CREATE ROW POLICY, ALTER ROW POLICY, DROP ROW POLICY, SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, SYSTEM MOVES, SYSTEM PULLING REPLICATION LOG, SYSTEM CLEANUP, SYSTEM VIEWS, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, SYSTEM VIRTUAL PARTS UPDATE, SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM FLUSH DISTRIBUTED, dictGet ON test.`table` TO B\n" ) instance.query("REVOKE ALL ON test.table FROM B", user="A") assert instance.query("SHOW GRANTS FOR B") == "" From 6f3bad904b0d2f900bc8cb08a23e6a00027968fc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 20:42:44 +0100 Subject: [PATCH 27/66] Mark one setting as obsolete --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ae6ea165cc9..348e38cf269 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -875,7 +875,6 @@ class IColumn; M(SQLSecurityType, default_normal_view_sql_security, SQLSecurityType::INVOKER, "Allows to set a default value for SQL SECURITY option when creating a normal view.", 0) \ M(SQLSecurityType, default_materialized_view_sql_security, SQLSecurityType::DEFINER, "Allows to set a default value for SQL SECURITY option when creating a materialized view.", 0) \ M(String, default_view_definer, "CURRENT_USER", "Allows to set a default value for DEFINER option when creating view.", 0) \ - M(Bool, allow_experimental_shared_merge_tree, false, "Only available in ClickHouse Cloud", 0) \ M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud", 0) \ M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ @@ -902,6 +901,7 @@ class IColumn; MAKE_OBSOLETE(M, Bool, allow_experimental_geo_types, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_query_cache, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_alter_materialized_view_structure, true) \ + MAKE_OBSOLETE(M, Bool, allow_experimental_shared_merge_tree, true) \ \ MAKE_OBSOLETE(M, Milliseconds, async_insert_stale_timeout_ms, 0) \ MAKE_OBSOLETE(M, StreamingHandleErrorMode, handle_kafka_error_mode, StreamingHandleErrorMode::DEFAULT) \ From e6fd4658f47a4b4edf07a200d502a0acbd608821 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 20:44:23 +0100 Subject: [PATCH 28/66] Sync documentation --- src/Core/Settings.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 348e38cf269..2e5e1db78ed 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -875,9 +875,9 @@ class IColumn; M(SQLSecurityType, default_normal_view_sql_security, SQLSecurityType::INVOKER, "Allows to set a default value for SQL SECURITY option when creating a normal view.", 0) \ M(SQLSecurityType, default_materialized_view_sql_security, SQLSecurityType::DEFINER, "Allows to set a default value for SQL SECURITY option when creating a materialized view.", 0) \ 
M(String, default_view_definer, "CURRENT_USER", "Allows to set a default value for DEFINER option when creating view.", 0) \ - M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud", 0) \ - M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ - M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ + M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud. Number of background threads for speculatively downloading new data parts into file cache, when cache_populated_by_fetch is enabled. Zero to disable.", 0) \ + M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud. Exclude new data parts from SELECT queries until they're either pre-warmed (see cache_populated_by_fetch) or this many seconds old. Only for Replicated-/SharedMergeTree.", 0) \ + M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud. If a merged part is less than this many seconds old and is not pre-warmed (see cache_populated_by_fetch), but all its source parts are available and pre-warmed, SELECT queries will read from those parts instead. Only for ReplicatedMergeTree. Note that this only checks whether CacheWarmer processed the part; if the part was fetched into cache by something else, it'll still be considered cold until CacheWarmer gets to it; if it was warmed, then evicted from cache, it'll still be considered warm.", 0) \ M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. Note that it can lead to incorrect result", 0) \ // End of COMMON_SETTINGS From 5c3262832be297a15c53506ef088f9c80b7bf0d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 21:38:49 +0100 Subject: [PATCH 29/66] Fix a test with Analyzer --- tests/analyzer_tech_debt.txt | 1 - .../02493_inconsistent_hex_and_binary_number.expect | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 0672d3085fe..f1093fa07db 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -13,7 +13,6 @@ 01952_optimize_distributed_group_by_sharding_key 02174_cte_scalar_cache_mv 02354_annoy -02493_inconsistent_hex_and_binary_number # Check after constants refactoring 02901_parallel_replicas_rollup # Flaky. Please don't delete them without fixing them: diff --git a/tests/queries/0_stateless/02493_inconsistent_hex_and_binary_number.expect b/tests/queries/0_stateless/02493_inconsistent_hex_and_binary_number.expect index 2d595b0f492..1cc11f9bf9f 100755 --- a/tests/queries/0_stateless/02493_inconsistent_hex_and_binary_number.expect +++ b/tests/queries/0_stateless/02493_inconsistent_hex_and_binary_number.expect @@ -18,23 +18,23 @@ spawn bash send "source $basedir/../shell_config.sh\r" send "\$CLICKHOUSE_CLIENT --query 'select 0b'\r" -expect "DB::Exception: Missing columns: '0b' while processing query: 'SELECT `0b`', required columns: '0b'. (UNKNOWN_IDENTIFIER)" +expect "(UNKNOWN_IDENTIFIER)" send "\$CLICKHOUSE_CLIENT --query 'select 0b;'\r" -expect "DB::Exception: Missing columns: '0b' while processing query: 'SELECT `0b`', required columns: '0b'. (UNKNOWN_IDENTIFIER)" +expect "(UNKNOWN_IDENTIFIER)" send "\$CLICKHOUSE_CLIENT --query 'select 0b ;'\r" -expect "DB::Exception: Missing columns: '0b' while processing query: 'SELECT `0b`', required columns: '0b'. 
(UNKNOWN_IDENTIFIER)" +expect "(UNKNOWN_IDENTIFIER)" send "\$CLICKHOUSE_CLIENT --query 'select 0x'\r" -expect "DB::Exception: Missing columns: '0x' while processing query: 'SELECT `0x`', required columns: '0x'. (UNKNOWN_IDENTIFIER)" +expect "(UNKNOWN_IDENTIFIER)" send "\$CLICKHOUSE_CLIENT --query 'select 0x;'\r" -expect "DB::Exception: Missing columns: '0x' while processing query: 'SELECT `0x`', required columns: '0x'. (UNKNOWN_IDENTIFIER)" +expect "(UNKNOWN_IDENTIFIER)" send "\$CLICKHOUSE_CLIENT --query 'select 0x ;'\r" -expect "DB::Exception: Missing columns: '0x' while processing query: 'SELECT `0x`', required columns: '0x'. (UNKNOWN_IDENTIFIER)" +expect "(UNKNOWN_IDENTIFIER)" send "exit\r" expect eof From 772cf60de18319cad57992e1ace391f706b5ddb9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 21:42:04 +0100 Subject: [PATCH 30/66] Edit SettingsChangesHistory --- src/Core/SettingsChangesHistory.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index face1def4b4..f195ef487ab 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,6 +85,7 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"24.3", {{"allow_experimental_shared_merge_tree", false, true, "The setting is obsolete"}}}, {"24.2", { {"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, From b41b935a6a9d386c259e71d5ae5be530076fb89d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 22:05:48 +0100 Subject: [PATCH 31/66] Two tests are fixed --- tests/analyzer_integration_broken_tests.txt | 1 - tests/analyzer_tech_debt.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/analyzer_integration_broken_tests.txt b/tests/analyzer_integration_broken_tests.txt index 796ca6bca22..e819e134706 100644 --- a/tests/analyzer_integration_broken_tests.txt +++ b/tests/analyzer_integration_broken_tests.txt @@ -3,5 +3,4 @@ test_concurrent_backups_s3/test.py::test_concurrent_backups test_distributed_type_object/test.py::test_distributed_type_object test_merge_table_over_distributed/test.py::test_global_in test_merge_table_over_distributed/test.py::test_select_table_name_from_merge_over_distributed -test_passing_max_partitions_to_read_remotely/test.py::test_default_database_on_cluster test_select_access_rights/test_main.py::test_alias_columns diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 0672d3085fe..9328504cd7c 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -20,4 +20,3 @@ 01287_max_execution_speed 02003_WithMergeableStateAfterAggregationAndLimit_LIMIT_BY_LIMIT_OFFSET 02404_memory_bound_merging -02479_race_condition_between_insert_and_droppin_mv From fe50f5ddf64f4c9902183854365f247a3b72edd0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Mar 2024 22:49:54 +0100 Subject: [PATCH 32/66] Make String a supertype for strings and enums --- src/DataTypes/getLeastSupertype.cpp | 10 ++++++---- .../03003_enum_and_string_compatible.reference | 1 + .../0_stateless/03003_enum_and_string_compatible.sql | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 
tests/queries/0_stateless/03003_enum_and_string_compatible.reference create mode 100644 tests/queries/0_stateless/03003_enum_and_string_compatible.sql diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index d67d5eb24e0..dec77119eed 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -474,16 +474,18 @@ DataTypePtr getLeastSupertype(const DataTypes & types) type_ids.insert(type->getTypeId()); /// For String and FixedString, or for different FixedStrings, the common type is String. - /// No other types are compatible with Strings. TODO Enums? + /// If there are Enums and any type of Strings, the common type is String. + /// No other types are compatible with Strings. { size_t have_string = type_ids.count(TypeIndex::String); size_t have_fixed_string = type_ids.count(TypeIndex::FixedString); + size_t have_enums = type_ids.count(TypeIndex::Enum8) + type_ids.count(TypeIndex::Enum16); if (have_string || have_fixed_string) { - bool all_strings = type_ids.size() == (have_string + have_fixed_string); - if (!all_strings) - return throwOrReturn(types, "because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE); + bool all_compatible_with_string = type_ids.size() == (have_string + have_fixed_string + have_enums); + if (!all_compatible_with_string) + return throwOrReturn(types, "because some of them are String/FixedString/Enum and some of them are not", ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); } diff --git a/tests/queries/0_stateless/03003_enum_and_string_compatible.reference b/tests/queries/0_stateless/03003_enum_and_string_compatible.reference new file mode 100644 index 00000000000..acf5fe0d423 --- /dev/null +++ b/tests/queries/0_stateless/03003_enum_and_string_compatible.reference @@ -0,0 +1 @@ +['Hello','Goodbye','test'] diff --git a/tests/queries/0_stateless/03003_enum_and_string_compatible.sql b/tests/queries/0_stateless/03003_enum_and_string_compatible.sql new file mode 100644 index 00000000000..0abba6741ac --- /dev/null +++ b/tests/queries/0_stateless/03003_enum_and_string_compatible.sql @@ -0,0 +1 @@ +WITH 'Hello'::Enum8('Hello', 'World') AS enum1, 'test'::Enum8('test', 'best') AS enum2 SELECT [enum1, 'Goodbye', enum2]; From cbf5443585b82519310a45a7b4ad2f03873e796f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 00:11:55 +0100 Subject: [PATCH 33/66] Remove old code --- base/base/Decimal.h | 9 +-- base/base/JSON.cpp | 10 +-- base/base/JSON.h | 10 +-- base/base/coverage.cpp | 9 --- base/base/defines.h | 80 ++++++------------- base/base/iostream_debug_helpers.h | 2 - base/base/phdr_cache.cpp | 10 +-- base/glibc-compatibility/musl/getauxval.c | 6 +- base/harmful/harmful.c | 6 +- programs/benchmark/Benchmark.cpp | 4 - programs/client/Client.cpp | 4 - src/Columns/Collator.cpp | 6 +- src/Common/FailPoint.h | 4 - src/Common/MatchGenerator.cpp | 20 ++--- src/Common/MemorySanitizer.h | 4 - src/Common/NetlinkMetricsProvider.cpp | 6 +- src/Common/StackTrace.h | 4 +- src/Common/TargetSpecific.h | 40 ---------- .../integer_hash_tables_and_hashes.cpp | 10 +-- src/Common/intExp.h | 10 +-- src/Common/re2.h | 11 +-- src/Common/tests/gtest_DateLUTImpl.cpp | 4 +- src/Compression/LZ4_decompress_faster.cpp | 8 -- src/Formats/MarkInCompressedFile.h | 10 +-- src/Functions/FunctionsHashing.h | 10 +-- src/Functions/idna.cpp | 17 ++-- src/Functions/punycode.cpp | 4 - src/Functions/s2_fwd.h | 4 - src/Functions/seriesDecomposeSTL.cpp | 6 -- 
src/Functions/seriesPeriodDetectFFT.cpp | 14 ++-- src/IO/Archives/ArchiveUtils.h | 2 - src/IO/DoubleConverter.h | 4 - src/IO/WriteHelpers.h | 4 - src/IO/readFloatText.h | 4 - src/Interpreters/examples/hash_map_string.cpp | 4 +- src/Parsers/ExpressionListParsers.h | 4 - .../MergeTree/MergeTreeDataPartType.h | 10 +-- src/Storages/StorageS3.cpp | 10 +-- 38 files changed, 80 insertions(+), 304 deletions(-) diff --git a/base/base/Decimal.h b/base/base/Decimal.h index afa186faf5b..66ff623217c 100644 --- a/base/base/Decimal.h +++ b/base/base/Decimal.h @@ -1,14 +1,9 @@ #pragma once + #include #include +#include -#if !defined(NO_SANITIZE_UNDEFINED) -#if defined(__clang__) - #define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) -#else - #define NO_SANITIZE_UNDEFINED -#endif -#endif namespace DB { diff --git a/base/base/JSON.cpp b/base/base/JSON.cpp index 0b43be38149..9da059c98b6 100644 --- a/base/base/JSON.cpp +++ b/base/base/JSON.cpp @@ -10,14 +10,10 @@ #define JSON_MAX_DEPTH 100 -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdeprecated-dynamic-exception-spec" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-dynamic-exception-spec" POCO_IMPLEMENT_EXCEPTION(JSONException, Poco::Exception, "JSONException") // NOLINT(cert-err60-cpp, modernize-use-noexcept, hicpp-use-noexcept) -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop /// Read unsigned integer in a simple form from a non-0-terminated string. diff --git a/base/base/JSON.h b/base/base/JSON.h index 850b74715c6..bc053670a96 100644 --- a/base/base/JSON.h +++ b/base/base/JSON.h @@ -39,14 +39,10 @@ // NOLINTBEGIN(google-explicit-constructor) -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdeprecated-dynamic-exception-spec" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wdeprecated-dynamic-exception-spec" POCO_DECLARE_EXCEPTION(Foundation_API, JSONException, Poco::Exception) -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop // NOLINTEND(google-explicit-constructor) class JSON diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index b85f1a16d32..99b897c4571 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -13,11 +13,7 @@ #include -# if defined(__clang__) extern "C" void __llvm_profile_dump(); // NOLINT -# elif defined(__GNUC__) || defined(__GNUG__) -extern "C" void __gcov_exit(); -# endif #endif @@ -28,12 +24,7 @@ void dumpCoverageReportIfPossible() static std::mutex mutex; std::lock_guard lock(mutex); -# if defined(__clang__) __llvm_profile_dump(); // NOLINT -# elif defined(__GNUC__) || defined(__GNUG__) - __gcov_exit(); -# endif - #endif } diff --git a/base/base/defines.h b/base/base/defines.h index 02058a29096..1f02748633d 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -11,7 +11,7 @@ /// including /// - it should not have fallback to 0, /// since this may create false-positive detection (common problem) -#if defined(__clang__) && defined(__has_feature) +#if defined(__has_feature) # define ch_has_feature __has_feature #endif @@ -76,24 +76,11 @@ /// Explicitly allow undefined behaviour for certain functions. Use it as a function attribute. /// It is useful in case when compiler cannot see (and exploit) it, but UBSan can. /// Example: multiplication of signed integers with possibility of overflow when both sides are from user input. 
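/// A minimal sketch of such a function (illustrative only, not part of this header; the name is made up):
///     Int64 NO_SANITIZE_UNDEFINED multiplyIgnoringOverflow(Int64 a, Int64 b) { return a * b; }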
-#if defined(__clang__) -# define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) -# define NO_SANITIZE_ADDRESS __attribute__((__no_sanitize__("address"))) -# define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) -# define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED __attribute__((__always_inline__, __no_sanitize__("undefined"))) -#else /// It does not work in GCC. GCC 7 cannot recognize this attribute and GCC 8 simply ignores it. -# define NO_SANITIZE_UNDEFINED -# define NO_SANITIZE_ADDRESS -# define NO_SANITIZE_THREAD -# define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED ALWAYS_INLINE -#endif - -#if defined(__clang__) && defined(__clang_major__) && __clang_major__ >= 14 -# define DISABLE_SANITIZER_INSTRUMENTATION __attribute__((disable_sanitizer_instrumentation)) -#else -# define DISABLE_SANITIZER_INSTRUMENTATION -#endif - +#define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) +#define NO_SANITIZE_ADDRESS __attribute__((__no_sanitize__("address"))) +#define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) +#define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED __attribute__((__always_inline__, __no_sanitize__("undefined"))) +#define DISABLE_SANITIZER_INSTRUMENTATION __attribute__((disable_sanitizer_instrumentation)) #if !__has_include() || !defined(ADDRESS_SANITIZER) # define ASAN_UNPOISON_MEMORY_REGION(a, b) @@ -135,54 +122,33 @@ /// Macros for Clang Thread Safety Analysis (TSA). They can be safely ignored by other compilers. /// Feel free to extend, but please stay close to https://clang.llvm.org/docs/ThreadSafetyAnalysis.html#mutexheader -#if defined(__clang__) -# define TSA_GUARDED_BY(...) __attribute__((guarded_by(__VA_ARGS__))) /// data is protected by given capability -# define TSA_PT_GUARDED_BY(...) __attribute__((pt_guarded_by(__VA_ARGS__))) /// pointed-to data is protected by the given capability -# define TSA_REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__))) /// thread needs exclusive possession of given capability -# define TSA_REQUIRES_SHARED(...) __attribute__((requires_shared_capability(__VA_ARGS__))) /// thread needs shared possession of given capability -# define TSA_ACQUIRED_AFTER(...) __attribute__((acquired_after(__VA_ARGS__))) /// annotated lock must be locked after given lock -# define TSA_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis)) /// disable TSA for a function -# define TSA_CAPABILITY(...) __attribute__((capability(__VA_ARGS__))) /// object of a class can be used as capability -# define TSA_ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__))) /// function acquires a capability, but does not release it -# define TSA_TRY_ACQUIRE(...) __attribute__((try_acquire_capability(__VA_ARGS__))) /// function tries to acquire a capability and returns a boolean value indicating success or failure -# define TSA_RELEASE(...) __attribute__((release_capability(__VA_ARGS__))) /// function releases the given capability -# define TSA_ACQUIRE_SHARED(...) __attribute__((acquire_shared_capability(__VA_ARGS__))) /// function acquires a shared capability, but does not release it -# define TSA_TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__))) /// function tries to acquire a shared capability and returns a boolean value indicating success or failure -# define TSA_RELEASE_SHARED(...) 
__attribute__((release_shared_capability(__VA_ARGS__))) /// function releases the given shared capability -# define TSA_SCOPED_LOCKABLE __attribute__((scoped_lockable)) /// object of a class has scoped lockable capability +#define TSA_GUARDED_BY(...) __attribute__((guarded_by(__VA_ARGS__))) /// data is protected by given capability +#define TSA_PT_GUARDED_BY(...) __attribute__((pt_guarded_by(__VA_ARGS__))) /// pointed-to data is protected by the given capability +#define TSA_REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__))) /// thread needs exclusive possession of given capability +#define TSA_REQUIRES_SHARED(...) __attribute__((requires_shared_capability(__VA_ARGS__))) /// thread needs shared possession of given capability +#define TSA_ACQUIRED_AFTER(...) __attribute__((acquired_after(__VA_ARGS__))) /// annotated lock must be locked after given lock +#define TSA_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis)) /// disable TSA for a function +#define TSA_CAPABILITY(...) __attribute__((capability(__VA_ARGS__))) /// object of a class can be used as capability +#define TSA_ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__))) /// function acquires a capability, but does not release it +#define TSA_TRY_ACQUIRE(...) __attribute__((try_acquire_capability(__VA_ARGS__))) /// function tries to acquire a capability and returns a boolean value indicating success or failure +#define TSA_RELEASE(...) __attribute__((release_capability(__VA_ARGS__))) /// function releases the given capability +#define TSA_ACQUIRE_SHARED(...) __attribute__((acquire_shared_capability(__VA_ARGS__))) /// function acquires a shared capability, but does not release it +#define TSA_TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__))) /// function tries to acquire a shared capability and returns a boolean value indicating success or failure +#define TSA_RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__))) /// function releases the given shared capability +#define TSA_SCOPED_LOCKABLE __attribute__((scoped_lockable)) /// object of a class has scoped lockable capability /// Macros for suppressing TSA warnings for specific reads/writes (instead of suppressing it for the whole function) /// They use a lambda function to apply function attribute to a single statement. This enable us to suppress warnings locally instead of /// suppressing them in the whole function /// Consider adding a comment when using these macros. -# define TSA_SUPPRESS_WARNING_FOR_READ(x) ([&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> const auto & { return (x); }()) -# define TSA_SUPPRESS_WARNING_FOR_WRITE(x) ([&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> auto & { return (x); }()) +#define TSA_SUPPRESS_WARNING_FOR_READ(x) ([&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> const auto & { return (x); }()) +#define TSA_SUPPRESS_WARNING_FOR_WRITE(x) ([&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> auto & { return (x); }()) /// This macro is useful when only one thread writes to a member /// and you want to read this member from the same thread without locking a mutex. /// It's safe (because no concurrent writes are possible), but TSA generates a warning. /// (Seems like there's no way to verify it, but it makes sense to distinguish it from TSA_SUPPRESS_WARNING_FOR_READ for readability) -# define TSA_READ_ONE_THREAD(x) TSA_SUPPRESS_WARNING_FOR_READ(x) - -#else -# define TSA_GUARDED_BY(...) -# define TSA_PT_GUARDED_BY(...) -# define TSA_REQUIRES(...) -# define TSA_REQUIRES_SHARED(...) 
-# define TSA_NO_THREAD_SAFETY_ANALYSIS -# define TSA_CAPABILITY(...) -# define TSA_ACQUIRE(...) -# define TSA_TRY_ACQUIRE(...) -# define TSA_RELEASE(...) -# define TSA_ACQUIRE_SHARED(...) -# define TSA_TRY_ACQUIRE_SHARED(...) -# define TSA_RELEASE_SHARED(...) -# define TSA_SCOPED_LOCKABLE - -# define TSA_SUPPRESS_WARNING_FOR_READ(x) (x) -# define TSA_SUPPRESS_WARNING_FOR_WRITE(x) (x) -# define TSA_READ_ONE_THREAD(x) TSA_SUPPRESS_WARNING_FOR_READ(x) -#endif +#define TSA_READ_ONE_THREAD(x) TSA_SUPPRESS_WARNING_FOR_READ(x) /// A template function for suppressing warnings about unused variables or function results. template diff --git a/base/base/iostream_debug_helpers.h b/base/base/iostream_debug_helpers.h index f531a56031b..5c601251272 100644 --- a/base/base/iostream_debug_helpers.h +++ b/base/base/iostream_debug_helpers.h @@ -155,9 +155,7 @@ Out & dump(Out & out, const char * name, T && x) // NOLINT(cppcoreguidelines-mis return dumpValue(out, x) << "; "; } -#ifdef __clang__ #pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" -#endif #define DUMPVAR(VAR) ::dump(std::cerr, #VAR, (VAR)); #define DUMPHEAD std::cerr << __FILE__ << ':' << __LINE__ << " [ " << getThreadId() << " ] "; diff --git a/base/base/phdr_cache.cpp b/base/base/phdr_cache.cpp index 7d37f01b560..802d1bf35f5 100644 --- a/base/base/phdr_cache.cpp +++ b/base/base/phdr_cache.cpp @@ -11,10 +11,8 @@ /// Thread Sanitizer uses dl_iterate_phdr function on initialization and fails if we provide our own. #ifdef USE_PHDR_CACHE -#if defined(__clang__) -# pragma clang diagnostic ignored "-Wreserved-id-macro" -# pragma clang diagnostic ignored "-Wunused-macros" -#endif +#pragma clang diagnostic ignored "-Wreserved-id-macro" +#pragma clang diagnostic ignored "-Wunused-macros" #define __msan_unpoison(X, Y) // NOLINT #if defined(ch_has_feature) @@ -57,10 +55,6 @@ std::atomic phdr_cache {}; extern "C" -#ifndef __clang__ -[[gnu::visibility("default")]] -[[gnu::externally_visible]] -#endif int dl_iterate_phdr(int (*callback) (dl_phdr_info * info, size_t size, void * data), void * data) { auto * current_phdr_cache = phdr_cache.load(); diff --git a/base/glibc-compatibility/musl/getauxval.c b/base/glibc-compatibility/musl/getauxval.c index 44a9f979f99..ea5cff9fc11 100644 --- a/base/glibc-compatibility/musl/getauxval.c +++ b/base/glibc-compatibility/musl/getauxval.c @@ -20,11 +20,7 @@ /// Suppress TSan since it is possible for this code to be called from multiple threads, /// and initialization is safe to be done multiple times from multiple threads. -#if defined(__clang__) -# define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) -#else -# define NO_SANITIZE_THREAD -#endif +#define NO_SANITIZE_THREAD __attribute__((__no_sanitize__("thread"))) // We don't have libc struct available here. // Compute aux vector manually (from /proc/self/auxv). diff --git a/base/harmful/harmful.c b/base/harmful/harmful.c index 78796ca0c05..54b552a84ea 100644 --- a/base/harmful/harmful.c +++ b/base/harmful/harmful.c @@ -6,11 +6,7 @@ /// It is only enabled in debug build (its intended use is for CI checks). #if !defined(NDEBUG) -#if defined(__clang__) - #pragma clang diagnostic ignored "-Wincompatible-library-redeclaration" -#else - #pragma GCC diagnostic ignored "-Wbuiltin-declaration-mismatch" -#endif +#pragma clang diagnostic ignored "-Wincompatible-library-redeclaration" /// We cannot use libc headers here. 
long write(int, const void *, unsigned long); diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp index fac88c0621f..45dadfef774 100644 --- a/programs/benchmark/Benchmark.cpp +++ b/programs/benchmark/Benchmark.cpp @@ -567,10 +567,6 @@ public: } -#ifndef __clang__ -#pragma GCC optimize("-fno-var-tracking-assignments") -#endif - int mainEntryClickHouseBenchmark(int argc, char ** argv) { using namespace DB; diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 649a64b9de4..a2bd6b6016a 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -51,10 +51,6 @@ #include #include -#ifndef __clang__ -#pragma GCC optimize("-fno-var-tracking-assignments") -#endif - namespace fs = std::filesystem; using namespace std::literals; diff --git a/src/Columns/Collator.cpp b/src/Columns/Collator.cpp index 434a30c0450..f6a3bb40d25 100644 --- a/src/Columns/Collator.cpp +++ b/src/Columns/Collator.cpp @@ -8,10 +8,8 @@ # include # include #else -# if defined(__clang__) -# pragma clang diagnostic ignored "-Wunused-private-field" -# pragma clang diagnostic ignored "-Wmissing-noreturn" -# endif +# pragma clang diagnostic ignored "-Wunused-private-field" +# pragma clang diagnostic ignored "-Wmissing-noreturn" #endif #include diff --git a/src/Common/FailPoint.h b/src/Common/FailPoint.h index 613cfb15322..b3e1214d597 100644 --- a/src/Common/FailPoint.h +++ b/src/Common/FailPoint.h @@ -5,18 +5,14 @@ #include #include -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdocumentation" #pragma clang diagnostic ignored "-Wreserved-macro-identifier" -#endif #include #include -#ifdef __clang__ #pragma clang diagnostic pop -#endif #include diff --git a/src/Common/MatchGenerator.cpp b/src/Common/MatchGenerator.cpp index f047c21b470..9078a5d181f 100644 --- a/src/Common/MatchGenerator.cpp +++ b/src/Common/MatchGenerator.cpp @@ -1,18 +1,14 @@ -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" -# pragma clang diagnostic ignored "-Wgnu-anonymous-struct" -# pragma clang diagnostic ignored "-Wnested-anon-types" -# pragma clang diagnostic ignored "-Wunused-parameter" -# pragma clang diagnostic ignored "-Wshadow-field-in-constructor" -# pragma clang diagnostic ignored "-Wdtor-name" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wshadow-field-in-constructor" +#pragma clang diagnostic ignored "-Wdtor-name" #include #include #include -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop #ifdef LOG_INFO #undef LOG_INFO diff --git a/src/Common/MemorySanitizer.h b/src/Common/MemorySanitizer.h index bd44ff62acb..5d72e0b8f73 100644 --- a/src/Common/MemorySanitizer.h +++ b/src/Common/MemorySanitizer.h @@ -2,10 +2,8 @@ #include -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wreserved-id-macro" -#endif #undef __msan_unpoison #undef __msan_test_shadow @@ -32,6 +30,4 @@ # endif #endif -#ifdef __clang__ #pragma clang diagnostic pop -#endif diff --git a/src/Common/NetlinkMetricsProvider.cpp b/src/Common/NetlinkMetricsProvider.cpp index 6969b5b7542..172fede525a 100644 --- a/src/Common/NetlinkMetricsProvider.cpp +++ b/src/Common/NetlinkMetricsProvider.cpp @@ 
-22,10 +22,8 @@ #include #include -#if defined(__clang__) - #pragma clang diagnostic ignored "-Wgnu-anonymous-struct" - #pragma clang diagnostic ignored "-Wnested-anon-types" -#endif +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" /// Basic idea is motivated by "iotop" tool. /// More info: https://www.kernel.org/doc/Documentation/accounting/taskstats.txt diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index e5654162ecb..a16d889a67a 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -11,9 +11,7 @@ #ifdef OS_DARWIN // ucontext is not available without _XOPEN_SOURCE -# ifdef __clang__ -# pragma clang diagnostic ignored "-Wreserved-id-macro" -# endif +# pragma clang diagnostic ignored "-Wreserved-id-macro" # define _XOPEN_SOURCE 700 #endif #include diff --git a/src/Common/TargetSpecific.h b/src/Common/TargetSpecific.h index 229150ecccb..f9523f667b2 100644 --- a/src/Common/TargetSpecific.h +++ b/src/Common/TargetSpecific.h @@ -102,8 +102,6 @@ String toString(TargetArch arch); /// NOLINTNEXTLINE #define USE_MULTITARGET_CODE 1 -#if defined(__clang__) - #define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2"))) #define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi"))) #define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw"))) @@ -134,45 +132,7 @@ String toString(TargetArch arch); * To prevent this warning we define this function inside every macros with pragmas. */ # define DUMMY_FUNCTION_DEFINITION [[maybe_unused]] void _dummy_function_definition(); -#else -#define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native"))) -#define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native"))) -#define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native"))) -#define AVX512_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native"))) -#define AVX2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,bmi2,tune=native"))) -#define AVX_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,tune=native"))) -#define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt",tune=native))) -#define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE - -# define BEGIN_AVX512VBMI2_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,tune=native\")") -# define BEGIN_AVX512VBMI_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,tune=native\")") -# define BEGIN_AVX512BW_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,tune=native\")") -# define BEGIN_AVX512F_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC 
target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,tune=native\")") -# define BEGIN_AVX2_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,bmi2,tune=native\")") -# define BEGIN_AVX_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,tune=native\")") -# define BEGIN_SSE42_SPECIFIC_CODE \ - _Pragma("GCC push_options") \ - _Pragma("GCC target(\"sse,sse2,sse3,ssse3,sse4,popcnt,tune=native\")") -# define END_TARGET_SPECIFIC_CODE \ - _Pragma("GCC pop_options") - -/* GCC doesn't show such warning, we don't need to define anything. - */ -# define DUMMY_FUNCTION_DEFINITION -#endif #define DECLARE_SSE42_SPECIFIC_CODE(...) \ BEGIN_SSE42_SPECIFIC_CODE \ diff --git a/src/Common/benchmarks/integer_hash_tables_and_hashes.cpp b/src/Common/benchmarks/integer_hash_tables_and_hashes.cpp index 0bf13ef91ed..e6c09905ab8 100644 --- a/src/Common/benchmarks/integer_hash_tables_and_hashes.cpp +++ b/src/Common/benchmarks/integer_hash_tables_and_hashes.cpp @@ -26,10 +26,8 @@ #include #include -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wused-but-marked-unused" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wused-but-marked-unused" #include using Key = UInt64; @@ -385,6 +383,4 @@ OK_GOOGLE(TestRndInput, HashMap, TwoRoundsTwoVarsHash, elements_to_insert) OK_GOOGLE(TestRndInput, HashMap, WyHash, elements_to_insert) OK_GOOGLE(TestRndInput, HashMap, XXH3Hash, elements_to_insert) -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop diff --git a/src/Common/intExp.h b/src/Common/intExp.h index 69b0f09975a..25ae2a8a4b6 100644 --- a/src/Common/intExp.h +++ b/src/Common/intExp.h @@ -4,15 +4,7 @@ #include #include - -// Also defined in Core/Defines.h -#if !defined(NO_SANITIZE_UNDEFINED) -#if defined(__clang__) - #define NO_SANITIZE_UNDEFINED __attribute__((__no_sanitize__("undefined"))) -#else - #define NO_SANITIZE_UNDEFINED -#endif -#endif +#include /// On overflow, the function returns unspecified value. diff --git a/src/Common/re2.h b/src/Common/re2.h index c81b7157e91..ef1d2ba2a16 100644 --- a/src/Common/re2.h +++ b/src/Common/re2.h @@ -1,11 +1,6 @@ #pragma once - -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" #include -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index 3f9b75e264d..d1d10dafb63 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -16,9 +16,7 @@ /// For the expansion of gtest macros. -#if defined(__clang__) - #pragma clang diagnostic ignored "-Wused-but-marked-unused" -#endif +#pragma clang diagnostic ignored "-Wused-but-marked-unused" // All timezones present at build time and embedded into ClickHouse binary. 
extern const char * auto_time_zones[];

diff --git a/src/Compression/LZ4_decompress_faster.cpp b/src/Compression/LZ4_decompress_faster.cpp
index c7f6571cb46..b548feed848 100644
--- a/src/Compression/LZ4_decompress_faster.cpp
+++ b/src/Compression/LZ4_decompress_faster.cpp
@@ -49,9 +49,7 @@ inline void copy8(UInt8 * dst, const UInt8 * src)
 inline void wildCopy8(UInt8 * dst, const UInt8 * src, const UInt8 * dst_end)
 {
     /// Unrolling with clang degrades performance by more than 10%.
-#if defined(__clang__)
 #pragma nounroll
-#endif
     do
     {
         copy8(dst, src);
@@ -234,9 +232,7 @@ inline void copy16(UInt8 * dst, const UInt8 * src)
 inline void wildCopy16(UInt8 * dst, const UInt8 * src, const UInt8 * dst_end)
 {
     /// Unrolling with clang degrades performance by more than 10%.
-#if defined(__clang__)
 #pragma nounroll
-#endif
     do
     {
         copy16(dst, src);
@@ -371,9 +367,7 @@ inline void copy32(UInt8 * dst, const UInt8 * src)
 inline void wildCopy32(UInt8 * dst, const UInt8 * src, const UInt8 * dst_end)
 {
     /// Unrolling with clang degrades performance by more than 10%.
-#if defined(__clang__)
 #pragma nounroll
-#endif
     do
     {
         copy32(dst, src);
@@ -487,9 +481,7 @@ bool NO_INLINE decompressImpl(const char * const source, char * const dest, size
     UInt8 * const output_end = op + dest_size;

     /// Unrolling with clang degrades performance by more than 10%.
-#if defined(__clang__)
 #pragma nounroll
-#endif
     while (true)
     {
         size_t length;
diff --git a/src/Formats/MarkInCompressedFile.h b/src/Formats/MarkInCompressedFile.h
index 92f4a030a1a..06ed1476410 100644
--- a/src/Formats/MarkInCompressedFile.h
+++ b/src/Formats/MarkInCompressedFile.h
@@ -12,10 +12,8 @@ namespace DB

 /// It's a bug in clang with three-way comparison operator
 /// https://github.com/llvm/llvm-project/issues/55919
-#ifdef __clang__
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
-#endif
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"

 /** Mark is the position in the compressed file. The compressed file consists of adjacent compressed blocks.
  * Mark is a tuple - the offset in the file to the start of the compressed block, the offset in the decompressed block to the start of the data.
@@ -41,9 +39,7 @@ struct MarkInCompressedFile
     }
 };

-#ifdef __clang__
-    #pragma clang diagnostic pop
-#endif
+#pragma clang diagnostic pop

 /**
  * In-memory representation of an array of marks.
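The same `-Wzero-as-null-pointer-constant` suppression around a defaulted three-way comparison also appears in MergeTreeDataPartType.h further down; both sites work around the clang bug tracked at https://github.com/llvm/llvm-project/issues/55919, and with GCC support gone the pragmas can be emitted unconditionally. A minimal, self-contained sketch of the pattern (the struct here is a hypothetical stand-in, not the real Mark):

#include <compare>
#include <cstddef>

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"

struct MarkLike
{
    size_t offset_in_compressed_file;
    size_t offset_in_decompressed_block;

    /// The compiler-generated body of this defaulted operator is what
    /// triggers the false-positive warning that the pragmas silence:
    /// per the linked issue, clang reports the literal 0 that the returned
    /// ordering is compared against as a null pointer constant.
    auto operator<=>(const MarkLike &) const = default;
};

#pragma clang diagnostic pop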
diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index d0edd34e657..79b33e2f75b 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -9,10 +9,8 @@ #include "config.h" -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wused-but-marked-unused" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wused-but-marked-unused" #include #include @@ -1604,6 +1602,4 @@ using FunctionXXH3 = FunctionAnyHash; using FunctionWyHash64 = FunctionAnyHash; } -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop diff --git a/src/Functions/idna.cpp b/src/Functions/idna.cpp index a73347400c6..c9682b44b2c 100644 --- a/src/Functions/idna.cpp +++ b/src/Functions/idna.cpp @@ -6,16 +6,12 @@ #include #include -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wnewline-eof" -#endif -# include -# include -# include -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnewline-eof" +#include +#include +#include +#pragma clang diagnostic pop namespace DB { @@ -199,4 +195,3 @@ Computes the Unicode representation of ASCII-encoded Internationalized Domain Na } #endif - diff --git a/src/Functions/punycode.cpp b/src/Functions/punycode.cpp index 159189744bd..107302069b4 100644 --- a/src/Functions/punycode.cpp +++ b/src/Functions/punycode.cpp @@ -6,15 +6,11 @@ #include #include -#ifdef __clang__ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wnewline-eof" -#endif # include # include -#ifdef __clang__ # pragma clang diagnostic pop -#endif namespace DB { diff --git a/src/Functions/s2_fwd.h b/src/Functions/s2_fwd.h index 6e0b58ae118..4ed5d4fcc1b 100644 --- a/src/Functions/s2_fwd.h +++ b/src/Functions/s2_fwd.h @@ -1,8 +1,6 @@ #pragma once -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wambiguous-reversed-operator" -#endif #include #include @@ -11,6 +9,4 @@ #include #include -#ifdef __clang__ #pragma clang diagnostic pop -#endif diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index e9276c4aefb..fbabc801913 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -1,15 +1,9 @@ -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wold-style-cast" #pragma clang diagnostic ignored "-Wshadow" #pragma clang diagnostic ignored "-Wimplicit-float-conversion" -#endif - #include - -#ifdef __clang__ #pragma clang diagnostic pop -#endif #include #include diff --git a/src/Functions/seriesPeriodDetectFFT.cpp b/src/Functions/seriesPeriodDetectFFT.cpp index 61e3319d810..c01f6b7f07b 100644 --- a/src/Functions/seriesPeriodDetectFFT.cpp +++ b/src/Functions/seriesPeriodDetectFFT.cpp @@ -1,18 +1,14 @@ #include "config.h" #if USE_POCKETFFT -# ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wshadow" -# pragma clang diagnostic ignored "-Wextra-semi-stmt" -# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" -# endif +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wshadow" +# pragma clang diagnostic ignored "-Wextra-semi-stmt" +# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" # include -# ifdef __clang__ -# pragma clang diagnostic pop -# endif +# pragma clang diagnostic pop # include # include diff --git a/src/IO/Archives/ArchiveUtils.h 
b/src/IO/Archives/ArchiveUtils.h index 810b9d8d730..1b66be005a2 100644 --- a/src/IO/Archives/ArchiveUtils.h +++ b/src/IO/Archives/ArchiveUtils.h @@ -4,11 +4,9 @@ #if USE_LIBARCHIVE -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wreserved-macro-identifier" #include #include #endif -#endif diff --git a/src/IO/DoubleConverter.h b/src/IO/DoubleConverter.h index 18cbe4e3a1d..45721da5248 100644 --- a/src/IO/DoubleConverter.h +++ b/src/IO/DoubleConverter.h @@ -1,17 +1,13 @@ #pragma once -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdouble-promotion" -#endif #include #include #include -#ifdef __clang__ #pragma clang diagnostic pop -#endif namespace DB diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index f438990fd1c..8b743e6351b 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -39,15 +39,11 @@ #include #include -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-parameter" #pragma clang diagnostic ignored "-Wsign-compare" -#endif #include -#ifdef __clang__ #pragma clang diagnostic pop -#endif #include diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index 51964636389..597f0a06fb9 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -6,14 +6,10 @@ #include #include -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunneeded-internal-declaration" -#endif #include -#ifdef __clang__ #pragma clang diagnostic pop -#endif /** Methods for reading floating point numbers from text with decimal representation. * There are "precise", "fast" and "simple" implementations. diff --git a/src/Interpreters/examples/hash_map_string.cpp b/src/Interpreters/examples/hash_map_string.cpp index f3ec104a5f7..f30a9a4cac1 100644 --- a/src/Interpreters/examples/hash_map_string.cpp +++ b/src/Interpreters/examples/hash_map_string.cpp @@ -20,9 +20,7 @@ #include #include -#if defined(__clang__) - #pragma clang diagnostic ignored "-Wgnu-anonymous-struct" -#endif +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" struct CompactStringRef diff --git a/src/Parsers/ExpressionListParsers.h b/src/Parsers/ExpressionListParsers.h index 6dba5a9c31f..235d5782630 100644 --- a/src/Parsers/ExpressionListParsers.h +++ b/src/Parsers/ExpressionListParsers.h @@ -9,10 +9,8 @@ #include #include -#ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wc99-extensions" -#endif namespace DB { @@ -297,6 +295,4 @@ protected: } -#ifdef __clang__ #pragma clang diagnostic pop -#endif diff --git a/src/Storages/MergeTree/MergeTreeDataPartType.h b/src/Storages/MergeTree/MergeTreeDataPartType.h index 8b06da5167e..5096ee86db1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartType.h +++ b/src/Storages/MergeTree/MergeTreeDataPartType.h @@ -26,10 +26,8 @@ static E parseEnum(const String & str) /// It's a bug in clang with three-way comparison operator /// https://github.com/llvm/llvm-project/issues/55919 -#ifdef __clang__ - #pragma clang diagnostic push - #pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" /// Types of data part format. 
class MergeTreeDataPartType @@ -86,9 +84,7 @@ private: Value value; }; -#ifdef __clang__ - #pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop struct MergeTreeDataPartFormat { diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index da90dbb4076..dce51ada042 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -68,14 +68,10 @@ #include -#ifdef __clang__ -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" -#endif +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" #include -#ifdef __clang__ -# pragma clang diagnostic pop -#endif +#pragma clang diagnostic pop namespace fs = std::filesystem; From fc6f3c8399717891da40b820aa35385bd80a4540 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 00:49:22 +0100 Subject: [PATCH 34/66] Live view's life is close to the end --- src/Common/ErrorCodes.cpp | 1 - src/Core/Block.h | 1 - src/NOTICE | 9 --- src/Parsers/ASTAlterQuery.cpp | 3 - src/Parsers/ASTAlterQuery.h | 3 - src/Parsers/ASTCreateQuery.cpp | 7 --- src/Parsers/ASTCreateQuery.h | 1 - src/Parsers/ParserAlterQuery.cpp | 4 -- src/Parsers/ParserCreateQuery.cpp | 24 ------- src/Storages/LiveView/StorageLiveView.cpp | 77 +---------------------- src/Storages/LiveView/StorageLiveView.h | 12 +--- 11 files changed, 4 insertions(+), 138 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index ca00f2fd513..f7482d44b66 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -379,7 +379,6 @@ M(467, CANNOT_PARSE_BOOL) \ M(468, CANNOT_PTHREAD_ATTR) \ M(469, VIOLATED_CONSTRAINT) \ - M(470, QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW) \ M(471, INVALID_SETTING_VALUE) \ M(472, READONLY_SETTING) \ M(473, DEADLOCK_AVOIDED) \ diff --git a/src/Core/Block.h b/src/Core/Block.h index 1a4f8c2e446..c8bebb4552a 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -177,7 +177,6 @@ using BlockPtr = std::shared_ptr; using Blocks = std::vector; using BlocksList = std::list; using BlocksPtr = std::shared_ptr; -using BlocksPtrs = std::shared_ptr>; /// Extends block with extra data in derived classes struct ExtraBlock diff --git a/src/NOTICE b/src/NOTICE index c68280b1529..4e5f66c65c9 100644 --- a/src/NOTICE +++ b/src/NOTICE @@ -13,18 +13,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-Common/ErrorCodes.cpp -Common/UInt128.h -Core/Block.h -Core/Defines.h -Core/Settings.h -Databases/DatabasesCommon.cpp -IO/WriteBufferValidUTF8.cpp -Interpreters/InterpreterAlterQuery.cpp Interpreters/InterpreterCreateQuery.cpp Interpreters/InterpreterFactory.cpp Parsers/ASTAlterQuery.cpp -Parsers/ASTAlterQuery.h Parsers/ASTCreateQuery.cpp Parsers/ASTCreateQuery.h Parsers/ParserAlterQuery.cpp diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 605cc4ade42..a93ad1d1746 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -60,8 +60,6 @@ ASTPtr ASTAlterCommand::clone() const res->settings_resets = res->children.emplace_back(settings_resets->clone()).get(); if (select) res->select = res->children.emplace_back(select->clone()).get(); - if (values) - res->values = res->children.emplace_back(values->clone()).get(); if (rename_to) res->rename_to = res->children.emplace_back(rename_to->clone()).get(); @@ -518,7 +516,6 @@ void ASTAlterCommand::forEachPointerToChild(std::function f) f(reinterpret_cast(&settings_changes)); f(reinterpret_cast(&settings_resets)); f(reinterpret_cast(&select)); - f(reinterpret_cast(&values)); f(reinterpret_cast(&rename_to)); } diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index 867ebf26194..1799b75fce4 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -166,9 +166,6 @@ public: /// For MODIFY_SQL_SECURITY IAST * sql_security = nullptr; - /// In ALTER CHANNEL, ADD, DROP, SUSPEND, RESUME, REFRESH, MODIFY queries, the list of live views is stored here - IAST * values = nullptr; - /// Target column name IAST * rename_to = nullptr; diff --git a/src/Parsers/ASTCreateQuery.cpp b/src/Parsers/ASTCreateQuery.cpp index e8ccb8e9377..1315ea5784c 100644 --- a/src/Parsers/ASTCreateQuery.cpp +++ b/src/Parsers/ASTCreateQuery.cpp @@ -348,13 +348,6 @@ void ASTCreateQuery::formatQueryImpl(const FormatSettings & settings, FormatStat settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM " << (settings.hilite ? hilite_none : "") << quoteString(*attach_from_path); - if (live_view_periodic_refresh) - { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " WITH" << (settings.hilite ? hilite_none : "") - << (settings.hilite ? hilite_keyword : "") << " PERIODIC REFRESH " << (settings.hilite ? hilite_none : "") - << *live_view_periodic_refresh; - } - formatOnCluster(settings); } else diff --git a/src/Parsers/ASTCreateQuery.h b/src/Parsers/ASTCreateQuery.h index aeb84d754e3..64e6bc8ce48 100644 --- a/src/Parsers/ASTCreateQuery.h +++ b/src/Parsers/ASTCreateQuery.h @@ -122,7 +122,6 @@ public: ASTDictionary * dictionary = nullptr; /// dictionary definition (layout, primary key, etc.) ASTRefreshStrategy * refresh_strategy = nullptr; // For CREATE MATERIALIZED VIEW ... REFRESH ... - std::optional live_view_periodic_refresh; /// For CREATE LIVE VIEW ... WITH [PERIODIC] REFRESH ... 
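For context on what is being deleted here: the removed `live_view_periodic_refresh` member fed the scheduling logic that disappears from StorageLiveView.cpp below. As a rough, self-contained sketch of the rule that code implemented (the function name and signature are hypothetical, chosen for illustration): refresh immediately when the cached blocks are older than one period, otherwise sleep until `blocks_time + period`.

#include <algorithm>
#include <chrono>
#include <cstdint>

using Seconds = std::chrono::seconds;
using Milliseconds = std::chrono::milliseconds;

/// Returns the delay before the next refresh; 0 means "the cached result
/// is already stale, refresh now".
uint64_t millisecondsUntilNextRefresh(Seconds current_time, Seconds blocks_time, Seconds period)
{
    /// The cached result blocks are older than one refresh period.
    if ((current_time - period) >= blocks_time)
        return 0;

    auto delay = std::chrono::duration_cast<Milliseconds>((blocks_time + period) - current_time);
    return static_cast<uint64_t>(std::max<int64_t>(delay.count(), 0));
}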
bool is_watermark_strictly_ascending{false}; /// STRICTLY ASCENDING WATERMARK STRATEGY FOR WINDOW VIEW bool is_watermark_ascending{false}; /// ASCENDING WATERMARK STRATEGY FOR WINDOW VIEW diff --git a/src/Parsers/ParserAlterQuery.cpp b/src/Parsers/ParserAlterQuery.cpp index 495e91b96d5..b1cc7622e00 100644 --- a/src/Parsers/ParserAlterQuery.cpp +++ b/src/Parsers/ParserAlterQuery.cpp @@ -138,7 +138,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserList parser_reset_setting( std::make_unique(), std::make_unique(TokenType::Comma), /* allow_empty = */ false); - ParserNameList values_p; ParserSelectWithUnionQuery select_p; ParserSQLSecurity sql_security_p; ParserRefreshStrategy refresh_p; @@ -163,7 +162,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ASTPtr command_settings_changes; ASTPtr command_settings_resets; ASTPtr command_select; - ASTPtr command_values; ASTPtr command_rename_to; ASTPtr command_sql_security; @@ -944,8 +942,6 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected command->settings_resets = command->children.emplace_back(std::move(command_settings_resets)).get(); if (command_select) command->select = command->children.emplace_back(std::move(command_select)).get(); - if (command_values) - command->values = command->children.emplace_back(std::move(command_values)).get(); if (command_sql_security) command->sql_security = command->children.emplace_back(std::move(command_sql_security)).get(); if (command_rename_to) diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 3c86ed6b518..8ebadf4606f 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -917,15 +917,11 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e ASTPtr as_database; ASTPtr as_table; ASTPtr select; - ASTPtr live_view_periodic_refresh; ASTPtr sql_security; String cluster_str; bool attach = false; bool if_not_exists = false; - bool with_and = false; - bool with_timeout = false; - bool with_periodic_refresh = false; if (!s_create.ignore(pos, expected)) { @@ -949,23 +945,6 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e if (!table_name_p.parse(pos, table, expected)) return false; - if (ParserKeyword{"WITH"}.ignore(pos, expected)) - { - if (ParserKeyword{"REFRESH"}.ignore(pos, expected) || ParserKeyword{"PERIODIC REFRESH"}.ignore(pos, expected)) - { - if (!ParserNumber{}.parse(pos, live_view_periodic_refresh, expected)) - live_view_periodic_refresh = std::make_shared(static_cast(60)); - - with_periodic_refresh = true; - } - - else if (with_and) - return false; - - if (!with_timeout && !with_periodic_refresh) - return false; - } - if (ParserKeyword{"ON"}.ignore(pos, expected)) { if (!ASTQueryWithOnCluster::parse(pos, cluster_str, expected)) @@ -1028,9 +1007,6 @@ bool ParserCreateLiveViewQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & e tryGetIdentifierNameInto(as_table, query->as_table); query->set(query->select, select); - if (live_view_periodic_refresh) - query->live_view_periodic_refresh.emplace(live_view_periodic_refresh->as().value.safeGet()); - if (comment) query->set(query->comment, comment); diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp index 2f011567b90..958e0a326cf 100644 --- a/src/Storages/LiveView/StorageLiveView.cpp +++ b/src/Storages/LiveView/StorageLiveView.cpp @@ -57,7 +57,7 @@ namespace ErrorCodes { 
extern const int INCORRECT_QUERY; extern const int TABLE_WAS_NOT_DROPPED; - extern const int QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW; + extern const int NOT_IMPLEMENTED; extern const int SUPPORT_IS_DISABLED; extern const int UNSUPPORTED_METHOD; } @@ -86,14 +86,14 @@ SelectQueryDescription buildSelectQueryDescription(const ASTPtr & select_query, if (inner_select_with_union_query) { if (inner_select_with_union_query->list_of_selects->children.size() != 1) - throw Exception(ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW, "UNION is not supported for LIVE VIEW"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "UNION is not supported for LIVE VIEW"); inner_query = inner_select_with_union_query->list_of_selects->children[0]; } auto * inner_select_query = inner_query->as(); if (!inner_select_query) - throw Exception(DB::ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_LIVE_VIEW, + throw Exception(DB::ErrorCodes::NOT_IMPLEMENTED, "LIVE VIEWs are only supported for queries from tables, " "but there is no table name in select query."); @@ -226,29 +226,9 @@ StorageLiveView::StorageLiveView( DatabaseCatalog::instance().addViewDependency(select_query_description.select_table_id, table_id_); - if (query.live_view_periodic_refresh) - { - is_periodically_refreshed = true; - periodic_live_view_refresh = Seconds {*query.live_view_periodic_refresh}; - } - blocks_ptr = std::make_shared(); blocks_metadata_ptr = std::make_shared(); active_ptr = std::make_shared(true); - - periodic_refresh_task = getContext()->getSchedulePool().createTask("LiveViewPeriodicRefreshTask", - [this] - { - try - { - periodicRefreshTaskFunc(); - } - catch (...) - { - tryLogCurrentException(log, "Exception in LiveView periodic refresh task in BackgroundSchedulePool"); - } - }); - periodic_refresh_task->deactivate(); } StorageLiveView::~StorageLiveView() @@ -285,17 +265,12 @@ void StorageLiveView::drop() void StorageLiveView::startup() { - if (is_periodically_refreshed) - periodic_refresh_task->activate(); } void StorageLiveView::shutdown(bool) { shutdown_called = true; - if (is_periodically_refreshed) - periodic_refresh_task->deactivate(); - DatabaseCatalog::instance().removeViewDependency(select_query_description.select_table_id, getStorageID()); } @@ -311,17 +286,7 @@ Pipe StorageLiveView::read( std::lock_guard lock(mutex); if (!(*blocks_ptr)) - { refreshImpl(lock); - } - else if (is_periodically_refreshed) - { - Seconds current_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()); - Seconds blocks_time = std::chrono::duration_cast(getBlocksTime(lock).time_since_epoch()); - - if ((current_time - periodic_live_view_refresh) >= blocks_time) - refreshImpl(lock); - } return Pipe(std::make_shared(*blocks_ptr, getHeader())); } @@ -362,9 +327,6 @@ Pipe StorageLiveView::watch( if (!(*blocks_ptr)) refreshImpl(lock); - - if (is_periodically_refreshed) - scheduleNextPeriodicRefresh(lock); } processed_stage = QueryProcessingStage::Complete; @@ -746,39 +708,6 @@ bool StorageLiveView::getNewBlocks(const std::lock_guard & lock) return updated; } -void StorageLiveView::periodicRefreshTaskFunc() -{ - LOG_TRACE(log, "periodic refresh task"); - - std::lock_guard lock(mutex); - - if (hasActiveUsers(lock)) - scheduleNextPeriodicRefresh(lock); -} - -void StorageLiveView::scheduleNextPeriodicRefresh(const std::lock_guard & lock) -{ - Seconds current_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()); - Seconds blocks_time = std::chrono::duration_cast(getBlocksTime(lock).time_since_epoch()); - - if 
((current_time - periodic_live_view_refresh) >= blocks_time) - { - refreshImpl(lock); - blocks_time = std::chrono::duration_cast(getBlocksTime(lock).time_since_epoch()); - } - current_time = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()); - - auto next_refresh_time = blocks_time + periodic_live_view_refresh; - - if (current_time >= next_refresh_time) - periodic_refresh_task->scheduleAfter(0); - else - { - auto schedule_time = std::chrono::duration_cast (next_refresh_time - current_time); - periodic_refresh_task->scheduleAfter(static_cast(schedule_time.count())); - } -} - void registerStorageLiveView(StorageFactory & factory) { factory.registerStorage("LiveView", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/LiveView/StorageLiveView.h b/src/Storages/LiveView/StorageLiveView.h index 6b8780cb81b..bf6b13fc837 100644 --- a/src/Storages/LiveView/StorageLiveView.h +++ b/src/Storages/LiveView/StorageLiveView.h @@ -21,6 +21,7 @@ limitations under the License. */ namespace DB { +using BlocksPtrs = std::shared_ptr>; struct BlocksMetadata { @@ -172,11 +173,6 @@ private: /// Read new data blocks that store query result bool getNewBlocks(const std::lock_guard & lock); - void periodicRefreshTaskFunc(); - - /// Must be called with mutex locked - void scheduleNextPeriodicRefresh(const std::lock_guard & lock); - SelectQueryDescription select_query_description; /// Query over the mergeable blocks to produce final result @@ -186,9 +182,6 @@ private: LoggerPtr log; - bool is_periodically_refreshed = false; - Seconds periodic_live_view_refresh; - /// Mutex to protect access to sample block and inner_blocks_query mutable std::mutex sample_block_lock; mutable Block sample_block; @@ -208,9 +201,6 @@ private: MergeableBlocksPtr mergeable_blocks; std::atomic shutdown_called = false; - - /// Periodic refresh task used when [PERIODIC] REFRESH is specified in create statement - BackgroundSchedulePool::TaskHolder periodic_refresh_task; }; } From 9a5085a4c309ace6ded156de6b969b14131d2cfe Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 02:56:22 +0100 Subject: [PATCH 35/66] Fix buffer overflow in CompressionCodecMultiple --- src/Compression/CompressionCodecMultiple.cpp | 24 +++++++++++++++++-- ...3_codec_multiple_buffer_overflow.reference | 1 + .../03003_codec_multiple_buffer_overflow.sh | 8 +++++++ 3 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference create mode 100755 tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh diff --git a/src/Compression/CompressionCodecMultiple.cpp b/src/Compression/CompressionCodecMultiple.cpp index b1eb7fb50c3..801609bbe14 100644 --- a/src/Compression/CompressionCodecMultiple.cpp +++ b/src/Compression/CompressionCodecMultiple.cpp @@ -88,14 +88,34 @@ void CompressionCodecMultiple::doDecompressData(const char * source, UInt32 sour const auto codec = CompressionCodecFactory::instance().get(compression_method); auto additional_size_at_the_end_of_buffer = codec->getAdditionalSizeAtTheEndOfBuffer(); - compressed_buf.resize(compressed_buf.size() + additional_size_at_the_end_of_buffer); + if (compressed_buf.size() >= 1_GiB) + throw Exception(decompression_error_code, "Too large compressed size: {}", compressed_buf.size()); + + { + UInt32 bytes_to_resize; + if (common::addOverflow(static_cast(compressed_buf.size()), additional_size_at_the_end_of_buffer, bytes_to_resize)) + throw Exception(decompression_error_code, "Too 
large compressed size: {}", compressed_buf.size()); + + compressed_buf.resize(compressed_buf.size() + additional_size_at_the_end_of_buffer); + } + UInt32 uncompressed_size = readDecompressedBlockSize(compressed_buf.data()); + if (uncompressed_size >= 1_GiB) + throw Exception(decompression_error_code, "Too large uncompressed size: {}", uncompressed_size); + if (idx == 0 && uncompressed_size != decompressed_size) throw Exception(decompression_error_code, "Wrong final decompressed size in codec Multiple, got {}, expected {}", uncompressed_size, decompressed_size); - uncompressed_buf.resize(uncompressed_size + additional_size_at_the_end_of_buffer); + { + UInt32 bytes_to_resize; + if (common::addOverflow(uncompressed_size, additional_size_at_the_end_of_buffer, bytes_to_resize)) + throw Exception(decompression_error_code, "Too large uncompressed size: {}", uncompressed_size); + + uncompressed_buf.resize(bytes_to_resize); + } + codec->decompress(compressed_buf.data(), source_size, uncompressed_buf.data()); uncompressed_buf.swap(compressed_buf); source_size = uncompressed_size; diff --git a/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference new file mode 100644 index 00000000000..b6db4b31fcb --- /dev/null +++ b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference @@ -0,0 +1 @@ +Too large diff --git a/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh new file mode 100755 index 00000000000..3a1537356c2 --- /dev/null +++ b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo -ne 'checksumchecksum\x91\xa4\x0a\x00\x00\x41\x00\x00\x20\x41\x41\x41\x40\x41\x00\x41\x41\x41\x41\x40\x41\x00\x00\x00\x00\x00\x0c\x00\x20\x41\x41\xbe\x22\x41\x41\x41\x41\x41\x00\x00\x00\x00\x00\x01\xfe\x7f\x00\x00\x41\x00\x00\x00\x41\x92\x6b\x00\x41\x41\x0b\x00\x00\x00\x00\x00\x41\x92\x6b\x00\x41\x41\x0b\x00\x00\x82\x82\x82\x82\x63\x82\xff\xff\xff\xff\xff\xff\xff\xff\x95\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x41\x41\x41\x41\x41\x41\x41\x41\x41\x40\x08\x08\x08\x08\x08\x08\x00\x06\x00\x00\x00\x08\x00\x20\x00\x00\xef\xff\xff\xff\xe1\x40\x26\x41\x00\x1d\x01\x00\x00\x41\x42\x0b\xff\xff\xff\xe4\x41\x41\x4e\x41\x41\x06\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x7e\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x00\x04\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x9c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x4f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\
x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f
\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x6c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\xa9\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x4f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x6c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8
f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x9b\x8f\x8f\x8f\x20\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f' | + ${CLICKHOUSE_CURL} "${CLICKHOUSE_URL}&decompress=1&http_native_compression_disable_checksumming_on_decompress=1" --data-binary @- | grep -o -F 'Too large' From 7609856cd54b7a8f214182ac9d805bba586cda1b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 03:06:27 +0100 Subject: [PATCH 36/66] Fix another error --- src/Compression/CompressionCodecMultiple.cpp | 5 ----- src/Compression/CompressionCodecNone.cpp | 6 +++++- src/Compression/CompressionCodecNone.h | 2 -- .../03003_codec_multiple_buffer_overflow.reference | 1 + .../0_stateless/03003_codec_multiple_buffer_overflow.sh | 3 +++ 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Compression/CompressionCodecMultiple.cpp b/src/Compression/CompressionCodecMultiple.cpp index 801609bbe14..6dc10677a3f 100644 --- a/src/Compression/CompressionCodecMultiple.cpp +++ b/src/Compression/CompressionCodecMultiple.cpp @@ -1,14 +1,9 @@ #include #include #include -#include #include -#include -#include #include #include -#include -#include namespace DB diff --git a/src/Compression/CompressionCodecNone.cpp b/src/Compression/CompressionCodecNone.cpp index 065ac4a2625..53d62e51920 100644 --- a/src/Compression/CompressionCodecNone.cpp +++ b/src/Compression/CompressionCodecNone.cpp @@ -27,8 +27,12 @@ UInt32 CompressionCodecNone::doCompressData(const char * source, UInt32 source_s return source_size; } -void CompressionCodecNone::doDecompressData(const char * source, UInt32 /*source_size*/, char * dest, UInt32 uncompressed_size) const +void CompressionCodecNone::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const { + if (source_size != uncompressed_size) + throw Exception(decompression_error_code, "Wrong data for compression codec NONE: source_size ({}) != uncompressed_size ({})", + source_size, uncompressed_size); + memcpy(dest, source, uncompressed_size); } diff --git a/src/Compression/CompressionCodecNone.h b/src/Compression/CompressionCodecNone.h index 1565720947d..5d6f135b351 100644 --- a/src/Compression/CompressionCodecNone.h +++ b/src/Compression/CompressionCodecNone.h @@ -18,9 +18,7 @@ public: void updateHash(SipHash & hash) const override; protected: - UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; - void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; bool isCompression() const override { return false; } diff --git a/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference index b6db4b31fcb..93d120dac01 100644 --- a/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference +++ b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.reference @@ -1 +1,2 @@ Too large +Wrong data diff --git a/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh index 3a1537356c2..93290f62c58 100755 --- a/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh +++ b/tests/queries/0_stateless/03003_codec_multiple_buffer_overflow.sh @@ -6,3 +6,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" 
&& pwd) echo -ne 'checksumchecksum\x91\xa4\x0a\x00\x00\x41\x00\x00\x20\x41\x41\x41\x40\x41\x00\x41\x41\x41\x41\x40\x41\x00\x00\x00\x00\x00\x0c\x00\x20\x41\x41\xbe\x22\x41\x41\x41\x41\x41\x00\x00\x00\x00\x00\x01\xfe\x7f\x00\x00\x41\x00\x00\x00\x41\x92\x6b\x00\x41\x41\x0b\x00\x00\x00\x00\x00\x41\x92\x6b\x00\x41\x41\x0b\x00\x00\x82\x82\x82\x82\x63\x82\xff\xff\xff\xff\xff\xff\xff\xff\x95\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x41\x41\x41\x41\x41\x41\x41\x41\x41\x40\x08\x08\x08\x08\x08\x08\x00\x06\x00\x00\x00\x08\x00\x20\x00\x00\xef\xff\xff\xff\xe1\x40\x26\x41\x00\x1d\x01\x00\x00\x41\x42\x0b\xff\xff\xff\xe4\x41\x41\x4e\x41\x41\x06\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x7e\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x00\x04\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x9c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x4f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\
x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x31\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f
\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x6c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\xa9\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x4f\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x5c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x6c\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8
f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x9b\x8f\x8f\x8f\x20\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f\x8f' | ${CLICKHOUSE_CURL} "${CLICKHOUSE_URL}&decompress=1&http_native_compression_disable_checksumming_on_decompress=1" --data-binary @- | grep -o -F 'Too large' + +echo -ne 'checksumchecksum\x91\x2b\x01\x00\x00\xbe\xe1\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x00\x04\x04\x04\x00\x08\x7f\x00\x01\x06\x82\x82\x82\x82\x82\x82\x82\x80\x41\x41\x41\x41\x41\x9a\x75\x6d\x63\x68\x65\x63\x6b\x73\x6d\x63\x68\x65\x63\x6b\x73\x75\x00\x00\x00\x41\x41\x41\x42\x64\x41\x41\x41\x0c\x00\x1c\x41\x41\xbe\x22\x41\x41\x00\x00\x00\x00\x11\x00\x41\x41\x75\x00\x00\x00\x41\x41\x41\x42\x64\x41\x41\x41\x0c\x00\x20\x41\x41\xbe\x22\x41\x41\x41\x41\x41\x00\x00\x00\x00\x00\x01\x14\xff\x7f\x00\x41\x00\x00\x00\x00\x00\x00\x41\x41\x75\x00\x00\x00\x41\x41\x41\x42\x64\x41\x61\x41\x0c\x00\x20\x41\x41\xbe\x22\x41\x41\x41\x00\x41\x14\x14\x41\x14\x14\x14\x14\x14\x14\x14\x14\x14\x14\x14\x0f\x0f\x0f\x0f\x0f\x41\x41\x41\x41\x64\x00\x30\x00\xcf\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x41\x41\x0b\x00\x00\x00\x41\x41\x41\xe8\x1f\xe1\x00\x01\x00\xff\x00\x41\x41\xbf\x41\x41\x40\x40\xe1\x00\x00\x00\x00\x1a\x00\x20\x00\x00\x00\x41\x00\x00\x00\x42\x64\x41\x41\x41\x0c\x00\x1c\x41\x41\xbe\x22\x41\x41\x00\x00\x00\x00\x00\x00\x41\x41\x75\x00\x00\x00\x41\x41\x41\x42\x64\x00\x00\x0b\x00\xe6\xff\x00\x00\x00\x00\x00' | + ${CLICKHOUSE_CURL} "${CLICKHOUSE_URL}&decompress=1&http_native_compression_disable_checksumming_on_decompress=1" --data-binary @- | grep -o -F 'Wrong data' From 3476e36e87f720ef719c93621d2854ca364d2510 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 05:50:43 +0300 Subject: [PATCH 37/66] Update build.md 1. Add `apt-get update` command. 2. Put Docker to the bottom. --- docs/en/development/build.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/docs/en/development/build.md b/docs/en/development/build.md index b474c445604..acdde7b9245 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -14,20 +14,6 @@ Supported platforms: - PowerPC 64 LE (experimental) - RISC-V 64 (experimental) -## Building in docker -We use the docker image `clickhouse/binary-builder` for our CI builds. It contains everything necessary to build the binary and packages. There is a script `docker/packager/packager` to ease the image usage: - -```bash -# define a directory for the output artifacts -output_dir="build_results" -# a simplest build -./docker/packager/packager --package-type=binary --output-dir "$output_dir" -# build debian packages -./docker/packager/packager --package-type=deb --output-dir "$output_dir" -# by default, debian packages use thin LTO, so we can override it to speed up the build -CMAKE_FLAGS='-DENABLE_THINLTO=' ./docker/packager/packager --package-type=deb --output-dir "./$(git rev-parse --show-cdup)/build_results" -``` - ## Building on Ubuntu The following tutorial is based on Ubuntu Linux. @@ -37,6 +23,7 @@ The minimum recommended Ubuntu version for development is 22.04 LTS. 
 ### Install Prerequisites {#install-prerequisites}
 
 ``` bash
+sudo apt-get update
 sudo apt-get install git cmake ccache python3 ninja-build nasm yasm gawk lsb-release wget software-properties-common gnupg
 ```
@@ -133,3 +120,17 @@ mkdir build
 cmake -S . -B build
 cmake --build build
 ```
+
+## Building in docker
+We use the docker image `clickhouse/binary-builder` for our CI builds. It contains everything necessary to build the binary and packages. There is a script `docker/packager/packager` to ease the image usage:
+
+```bash
+# define a directory for the output artifacts
+output_dir="build_results"
+# a simplest build
+./docker/packager/packager --package-type=binary --output-dir "$output_dir"
+# build debian packages
+./docker/packager/packager --package-type=deb --output-dir "$output_dir"
+# by default, debian packages use thin LTO, so we can override it to speed up the build
+CMAKE_FLAGS='-DENABLE_THINLTO=' ./docker/packager/packager --package-type=deb --output-dir "./$(git rev-parse --show-cdup)/build_results"
+```

From 629af21701df814b583aacd5e241cb95963842b7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 4 Mar 2024 05:52:47 +0300
Subject: [PATCH 38/66] Update build.md

---
 docs/en/development/build.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/en/development/build.md b/docs/en/development/build.md
index acdde7b9245..5cbf851b785 100644
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@@ -44,7 +44,7 @@ sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
 
 For other Linux distributions - check the availability of LLVM's [prebuild packages](https://releases.llvm.org/download.html).
 
-As of August 2023, clang-16 or higher will work.
+As of March 2024, clang-17 or higher will work.
 GCC as a compiler is not supported.
 To build with a specific Clang version:
 
 to see what version you have installed before setting this environment variable.
 :::
 
 ``` bash
-export CC=clang-17
-export CXX=clang++-17
+export CC=clang-18
+export CXX=clang++-18
 ```
 
 ### Checkout ClickHouse Sources {#checkout-clickhouse-sources}

From 43e0a1b7081b26e77cd734817d494f79c2d2b10d Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 4 Mar 2024 05:59:02 +0100
Subject: [PATCH 39/66] Remove nonsense from SQL/JSON

---
 src/Functions/FunctionSQLJSON.h | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/src/Functions/FunctionSQLJSON.h b/src/Functions/FunctionSQLJSON.h
index 0533f3d419a..3efa40df9be 100644
--- a/src/Functions/FunctionSQLJSON.h
+++ b/src/Functions/FunctionSQLJSON.h
@@ -26,6 +26,7 @@
 #include "config.h"
 
+
 namespace DB
 {
 namespace ErrorCodes
@@ -114,8 +115,6 @@ private:
 };
 
-class EmptyJSONStringSerializer{};
-
 class FunctionSQLJSONHelpers
 {
@@ -156,25 +155,11 @@ public:
             throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument (JSONPath) must be constant string");
         }
 
-        const ColumnPtr & arg_jsonpath = json_path_column.column;
-        const auto * arg_jsonpath_const = typeid_cast<const ColumnConst *>(arg_jsonpath.get());
-        const auto * arg_jsonpath_string = typeid_cast<const ColumnString *>(arg_jsonpath_const->getDataColumnPtr().get());
-
-        const ColumnPtr & arg_json = json_column.column;
-        const auto * col_json_const = typeid_cast<const ColumnConst *>(arg_json.get());
-        const auto * col_json_string
-            = typeid_cast<const ColumnString *>(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get());
-
-        /// Get data and offsets for 1 argument (JSONPath)
-        const ColumnString::Chars & chars_path = arg_jsonpath_string->getChars();
-        const ColumnString::Offsets & offsets_path = arg_jsonpath_string->getOffsets();
-
-        /// Prepare to parse 1 argument (JSONPath)
-        const char * query_begin = reinterpret_cast<const char *>(&chars_path[0]);
-        const char * query_end = query_begin + offsets_path[0] - 1;
+        String query = typeid_cast<const ColumnConst &>(*json_path_column.column).getValue<String>();
 
-        /// Tokenize query
-        Tokens tokens(query_begin, query_end);
+        /// Tokenize the query
+        Tokens tokens(query.data(), query.data() + query.size());
         /// Max depth 0 indicates that depth is not limited
         IParser::Pos token_iterator(tokens, parse_depth);
@@ -188,10 +173,6 @@ public:
             throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unable to parse JSONPath");
         }
 
-        /// Get data and offsets for 2 argument (JSON)
-        const ColumnString::Chars & chars_json = col_json_string->getChars();
-        const ColumnString::Offsets & offsets_json = col_json_string->getOffsets();
-
         JSONParser json_parser;
         using Element = typename JSONParser::Element;
         Element document;
@@ -200,10 +181,9 @@ public:
         /// Parse JSON for every row
         Impl impl;
         GeneratorJSONPath<JSONParser> generator_json_path(res);
-        for (const auto i : collections::range(0, input_rows_count))
+        for (size_t i = 0; i < input_rows_count; ++i)
         {
-            std::string_view json{
-                reinterpret_cast<const char *>(&chars_json[offsets_json[i - 1]]), offsets_json[i] - offsets_json[i - 1] - 1};
+            std::string_view json = json_column.column->getDataAt(i).toView();
             document_ok = json_parser.parse(json, document);
 
             bool added_to_column = false;

From 5e43b733b2467d398b1ff85e8940508bb2b98565 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 4 Mar 2024 06:00:32 +0100
Subject: [PATCH 40/66] Add a test

---
 tests/queries/0_stateless/03003_sql_json_nonsense.reference | 1 +
 tests/queries/0_stateless/03003_sql_json_nonsense.sql       | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 tests/queries/0_stateless/03003_sql_json_nonsense.reference
 create mode 100644 tests/queries/0_stateless/03003_sql_json_nonsense.sql

diff --git a/tests/queries/0_stateless/03003_sql_json_nonsense.reference b/tests/queries/0_stateless/03003_sql_json_nonsense.reference
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/tests/queries/0_stateless/03003_sql_json_nonsense.reference
@@ -0,0 +1 @@
+
diff --git a/tests/queries/0_stateless/03003_sql_json_nonsense.sql b/tests/queries/0_stateless/03003_sql_json_nonsense.sql
new file mode 100644
index 00000000000..9b7beb42cf3
--- /dev/null
+++ b/tests/queries/0_stateless/03003_sql_json_nonsense.sql
@@ -0,0 +1 @@
+SELECT JSON_QUERY('{"x":1}', '$[\'hello\']', materialize(toLowCardinality('x')));

From 53e0d01d5e2909ee1370bcbeab93b4b8cfd3c26a Mon Sep 17 00:00:00 2001
From: beetelbrox <9376816+Beetelbrox@users.noreply.github.com>
Date: Mon, 4 Mar 2024 10:28:27 +0100
Subject: [PATCH 41/66] Do not send metadata headers on unsupported multipart
 operations

---
 src/IO/S3/Requests.cpp | 15 +++++++++++++
 src/IO/S3/Requests.h   | 19 ++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/IO/S3/Requests.cpp b/src/IO/S3/Requests.cpp
index 56d2e44a2c4..a4e61987bdf 100644
--- a/src/IO/S3/Requests.cpp
+++ b/src/IO/S3/Requests.cpp
@@ -52,6 +52,20 @@ Aws::Http::HeaderValueCollection CopyObjectRequest::GetRequestSpecificHeaders()
     return headers;
 }
 
+void CompleteMultipartUploadRequest::SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue)
+{
+    // S3's CompleteMultipartUpload doesn't support metadata headers so we skip adding them
+    if(!headerName.starts_with("x-amz-meta-"))
+        Model::CompleteMultipartUploadRequest::SetAdditionalCustomHeaderValue(headerName, headerValue);
+}
+
+void UploadPartRequest::SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue)
+{
+    // S3's UploadPart doesn't support metadata headers so we skip adding them
+    if(!headerName.starts_with("x-amz-meta-"))
+        Model::UploadPartRequest::SetAdditionalCustomHeaderValue(headerName, headerValue);
+}
+
 Aws::String ComposeObjectRequest::SerializePayload() const
 {
     if (component_names.empty())
@@ -70,6 +84,7 @@ Aws::String ComposeObjectRequest::SerializePayload() const
     return payload_doc.ConvertToString();
 }
 
+
 void ComposeObjectRequest::AddQueryStringParameters(Aws::Http::URI & /*uri*/) const
 {
 }
diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h
index bfb94a5a67e..e02ccd8d79e 100644
--- a/src/IO/S3/Requests.h
+++ b/src/IO/S3/Requests.h
@@ -107,12 +107,21 @@
 using ListObjectsV2Request = ExtendedRequest<Model::ListObjectsV2Request>;
 using ListObjectsRequest = ExtendedRequest<Model::ListObjectsRequest>;
 using GetObjectRequest = ExtendedRequest<Model::GetObjectRequest>;
 
-using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>;
-using CompleteMultipartUploadRequest = ExtendedRequest<Model::CompleteMultipartUploadRequest>;
-using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>;
-using UploadPartRequest = ExtendedRequest<Model::UploadPartRequest>;
-using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>;
+class UploadPartRequest : public ExtendedRequest<Model::UploadPartRequest>
+{
+public:
+    void SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue) override;
+};
+class CompleteMultipartUploadRequest : public ExtendedRequest<Model::CompleteMultipartUploadRequest>
+{
+public:
+    void SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue) override;
+};
+
+using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>;
+using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>;
+using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>;
 using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest>;
 using DeleteObjectRequest = ExtendedRequest<Model::DeleteObjectRequest>;
 using DeleteObjectsRequest = ExtendedRequest<Model::DeleteObjectsRequest>;

From 229945cdd2c1544f7eb6bdafa09f9da67dc8d01b Mon Sep 17 00:00:00 2001
From: beetelbrox <9376816+Beetelbrox@users.noreply.github.com>
Date: Mon, 4 Mar 2024 10:32:45 +0100
Subject: [PATCH 42/66] Fix whitespace

---
 src/IO/S3/Requests.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h
index e02ccd8d79e..7b4c3698f10 100644
--- a/src/IO/S3/Requests.h
+++ b/src/IO/S3/Requests.h
@@ -122,6 +122,7 @@ public:
 using CreateMultipartUploadRequest = ExtendedRequest<Model::CreateMultipartUploadRequest>;
 using AbortMultipartUploadRequest = ExtendedRequest<Model::AbortMultipartUploadRequest>;
 using UploadPartCopyRequest = ExtendedRequest<Model::UploadPartCopyRequest>;
+
 using PutObjectRequest = ExtendedRequest<Model::PutObjectRequest>;
 using DeleteObjectRequest = ExtendedRequest<Model::DeleteObjectRequest>;
 using DeleteObjectsRequest = ExtendedRequest<Model::DeleteObjectsRequest>;

From 83c1c537d582dd5ebd461623f7bf1ff427b27e77 Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Mon, 4 Mar 2024 10:48:50 +0100
Subject: [PATCH 43/66] Execute requests in order

---
 src/Coordination/KeeperStateMachine.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp
index e87ef037285..0c398a0d549 100644
--- a/src/Coordination/KeeperStateMachine.cpp
+++ b/src/Coordination/KeeperStateMachine.cpp
@@ -440,10 +440,11 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
         }
 
         ProfileEvents::increment(ProfileEvents::KeeperCommits);
-        keeper_context->setLastCommitIndex(log_idx);
 
         if (commit_callback)
            commit_callback(log_idx, *request_for_session);
+
+        keeper_context->setLastCommitIndex(log_idx);
     }
     catch (...)
     {

From e8f4e4eb772cf7576d4307af7e5e4e84c8600904 Mon Sep 17 00:00:00 2001
From: beetelbrox <9376816+Beetelbrox@users.noreply.github.com>
Date: Mon, 4 Mar 2024 12:54:55 +0100
Subject: [PATCH 44/66] Fix formatting

---
 src/IO/S3/Requests.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/IO/S3/Requests.cpp b/src/IO/S3/Requests.cpp
index a4e61987bdf..50ed2e21bfc 100644
--- a/src/IO/S3/Requests.cpp
+++ b/src/IO/S3/Requests.cpp
@@ -55,14 +55,14 @@ Aws::Http::HeaderValueCollection CopyObjectRequest::GetRequestSpecificHeaders()
 void CompleteMultipartUploadRequest::SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue)
 {
     // S3's CompleteMultipartUpload doesn't support metadata headers so we skip adding them
-    if(!headerName.starts_with("x-amz-meta-"))
+    if (!headerName.starts_with("x-amz-meta-"))
         Model::CompleteMultipartUploadRequest::SetAdditionalCustomHeaderValue(headerName, headerValue);
 }
 
 void UploadPartRequest::SetAdditionalCustomHeaderValue(const Aws::String& headerName, const Aws::String& headerValue)
 {
     // S3's UploadPart doesn't support metadata headers so we skip adding them
-    if(!headerName.starts_with("x-amz-meta-"))
+    if (!headerName.starts_with("x-amz-meta-"))
         Model::UploadPartRequest::SetAdditionalCustomHeaderValue(headerName, headerValue);
 }

From f212c9c2302727e6f3732eaf66ae2d438d4cdd58 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?=
Date: Mon, 4 Mar 2024 13:14:13 +0100
Subject: [PATCH 45/66] Fix crash in arrayEnumerateRanked

---
 src/Functions/array/arrayEnumerateRanked.cpp  | 135 ++++++++++--------
 src/Functions/array/arrayEnumerateRanked.h    |  22 +--
 .../0_stateless/00909_arrayEnumerateUniq.sql  |  18 +--
 .../03003_arrayEnumerate_crash.reference      |   0
 .../03003_arrayEnumerate_crash.sql            |   2 +
 5 files changed, 101 insertions(+), 76 deletions(-)
 create mode 100644 tests/queries/0_stateless/03003_arrayEnumerate_crash.reference
 create mode 100644 tests/queries/0_stateless/03003_arrayEnumerate_crash.sql

diff --git a/src/Functions/array/arrayEnumerateRanked.cpp b/src/Functions/array/arrayEnumerateRanked.cpp
index dd597d607dc..69d8954bfcf 100644
--- a/src/Functions/array/arrayEnumerateRanked.cpp
+++ b/src/Functions/array/arrayEnumerateRanked.cpp
@@ -1,8 +1,8 @@
-#include
 #include
+#include
 #include
 
-#include "arrayEnumerateRanked.h"
+#include
 
 namespace DB
 {
 namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
 }
 
-ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments)
+ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments, const char * function_name)
 {
     const size_t num_arguments = arguments.size();
+    if (!num_arguments)
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Missing arguments for function arrayEnumerateUniqRanked");
 
     DepthType clear_depth = 1;
-    DepthTypes depths;
+    size_t i = 0;
+    if (const DataTypeArray * type_array = typeid_cast<const DataTypeArray *>(arguments[0].type.get()); !type_array)
+    {
+        /// If the first argument is not an array, it must be a const positive and non zero number
+        const auto & depth_column = arguments[i].column;
+        if (!depth_column || !isColumnConst(*depth_column))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "First argument of {} must be Const(UInt64)", function_name);
+        Field f = assert_cast<const ColumnConst &>(*depth_column).getField();
+        if (f.getType() != Field::Types::UInt64 || f.safeGet<UInt64>() == 0)
"First argument of {} must be a positive integer", function_name); - /// function signature is the following: - /// f(c0, arr1, c1, arr2, c2, ...) - /// - /// c0 is something called "clear_depth" here. + clear_depth = static_cast(f.safeGet()); + i++; + } + + + /// The rest of the arguments must be in the shape: arr1, c1, arr2, c2, ... /// cN... - how deep to look into the corresponding arrN, (called "depths" here) - /// may be omitted - then it means "look at the full depth". - - size_t array_num = 0; - DepthType prev_array_depth = 0; - for (size_t i = 0; i < num_arguments; ++i) + /// may be omitted - then it means "look at the full depth" + DepthTypes depths; + for (; i < num_arguments; i++) { const DataTypePtr & type = arguments[i].type; - const DataTypeArray * type_array = typeid_cast(type.get()); + const DataTypeArray * current_type_array = typeid_cast(type.get()); + if (!current_type_array) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Incorrect argument {} type of function {}. Expected an Array, got {}", + i + 1, + function_name, + type->getName()); - if (type_array) + if (i == num_arguments - 1) { - if (depths.size() < array_num && prev_array_depth) - depths.emplace_back(prev_array_depth); - - prev_array_depth = static_cast(type_array->getNumberOfDimensions()); - ++array_num; + depths.emplace_back(current_type_array->getNumberOfDimensions()); } else { - const auto & depth_column = arguments[i].column; - - if (depth_column && isColumnConst(*depth_column)) + const DataTypeArray * next_argument_array = typeid_cast(arguments[i + 1].type.get()); + if (next_argument_array) { - UInt64 value = assert_cast(*depth_column).getValue(); - if (!value) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Incorrect arguments for function arrayEnumerateUniqRanked " - "or arrayEnumerateDenseRanked: depth ({}) cannot be less or equal 0.", - std::to_string(value)); - - if (i == 0) - { - clear_depth = static_cast(value); - } - else - { - if (depths.size() >= array_num) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Incorrect arguments for function arrayEnumerateUniqRanked " - "or arrayEnumerateDenseRanked: depth ({}) for missing array.", - std::to_string(value)); - if (value > prev_array_depth) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Arguments for function arrayEnumerateUniqRanked/arrayEnumerateDenseRanked incorrect: depth={}" - " for array with depth={}.", - std::to_string(value), std::to_string(prev_array_depth)); - - depths.emplace_back(value); - } + depths.emplace_back(current_type_array->getNumberOfDimensions()); + } + else + { + i++; + /// The following argument is not array, so it must be a const positive integer with the depth + const auto & depth_column = arguments[i].column; + if (!depth_column || !isColumnConst(*depth_column)) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Incorrect argument {} type of function {}. Expected an Array or Const(UInt64), got {}", + i + 1, + function_name, + arguments[i].type->getName()); + Field f = assert_cast(*depth_column).getField(); + if (f.getType() != Field::Types::UInt64 || f.safeGet() == 0) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Incorrect argument {} of function {}. Expected a positive integer", + i + 1, + function_name); + UInt64 value = f.safeGet(); + UInt64 prev_array_depth = current_type_array->getNumberOfDimensions(); + if (value > prev_array_depth) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Incorrect argument {} of function {}. 
Required depth '{}' is larger than the array depth ({})", + i + 1, + function_name, + value, + prev_array_depth); + depths.emplace_back(value); } } } - if (depths.size() < array_num) - depths.emplace_back(prev_array_depth); - if (depths.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Incorrect arguments for function arrayEnumerateUniqRanked or arrayEnumerateDenseRanked: " - "at least one array should be passed."); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Incorrect arguments for function {}: At least one array should be passed", function_name); DepthType max_array_depth = 0; for (auto depth : depths) max_array_depth = std::max(depth, max_array_depth); if (clear_depth > max_array_depth) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Incorrect arguments for function arrayEnumerateUniqRanked or arrayEnumerateDenseRanked: " - "clear_depth ({}) can't be larger than max_array_depth ({}).", - std::to_string(clear_depth), std::to_string(max_array_depth)); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Incorrect arguments for function {}: clear_depth ({}) can't be larger than max_array_depth ({})", + function_name, + clear_depth, + max_array_depth); return {clear_depth, depths, max_array_depth}; } diff --git a/src/Functions/array/arrayEnumerateRanked.h b/src/Functions/array/arrayEnumerateRanked.h index 1a920260906..04fa305368d 100644 --- a/src/Functions/array/arrayEnumerateRanked.h +++ b/src/Functions/array/arrayEnumerateRanked.h @@ -84,7 +84,7 @@ struct ArraysDepths }; /// Return depth info about passed arrays -ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments); +ArraysDepths getArraysDepths(const ColumnsWithTypeAndName & arguments, const char * function_name); template class FunctionArrayEnumerateRankedExtended : public IFunction @@ -105,7 +105,7 @@ public: "Number of arguments for function {} doesn't match: passed {}, should be at least 1.", getName(), arguments.size()); - const ArraysDepths arrays_depths = getArraysDepths(arguments); + const ArraysDepths arrays_depths = getArraysDepths(arguments, Derived::name); /// Return type is the array of the depth as the maximum effective depth of arguments, containing UInt32. @@ -154,7 +154,7 @@ ColumnPtr FunctionArrayEnumerateRankedExtended::executeImpl( Columns array_holders; ColumnPtr offsets_column; - const ArraysDepths arrays_depths = getArraysDepths(arguments); + const ArraysDepths arrays_depths = getArraysDepths(arguments, Derived::name); /// If the column is Array - return it. If the const Array - materialize it, keep ownership and return. 
     auto get_array_column = [&](const auto & column) -> const DB::ColumnArray *
@@ -213,17 +213,23 @@ ColumnPtr FunctionArrayEnumerateRankedExtended<Derived>::executeImpl(
             {
                 if (*offsets_by_depth[col_depth] != array->getOffsets())
                 {
-                    throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
-                        "Lengths and effective depths of all arrays passed to {} must be equal.", getName());
+                    throw Exception(
+                        ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
+                        "Lengths and effective depths of all arrays passed to {} must be equal",
+                        getName());
                 }
             }
         }
 
         if (col_depth < arrays_depths.depths[array_num])
        {
-            throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
-                "{}: Passed array number {} depth ({}) is more than the actual array depth ({}).",
-                getName(), array_num, std::to_string(arrays_depths.depths[array_num]), col_depth);
+            throw Exception(
+                ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
+                "{}: Passed array number {} depth ({}) is more than the actual array depth ({})",
+                getName(),
+                array_num,
+                std::to_string(arrays_depths.depths[array_num]),
+                col_depth);
         }
 
         auto * array_data = &array->getData();
diff --git a/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql b/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql
index 33097c99272..0bdb338e9d2 100644
--- a/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql
+++ b/tests/queries/0_stateless/00909_arrayEnumerateUniq.sql
@@ -181,15 +181,15 @@ SELECT arrayEnumerateUniqRanked([1,2], 1, 2); -- { serverError 36 }
 SELECT arrayEnumerateUniqRanked([1,2], 1, 3, 4, 5); -- { serverError 36 }
 SELECT arrayEnumerateUniqRanked([1,2], 1, 3, [4], 5); -- { serverError 36 }
 SELECT arrayEnumerateDenseRanked([[[[[[[[[[42]]]]]]]]]]);
-SELECT arrayEnumerateUniqRanked('wat', [1,2]); -- { serverError 170 }
-SELECT arrayEnumerateUniqRanked(1, [1,2], 'boom'); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked(['\0'], -8363126); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked(-10, ['\0'], -8363126); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked(1, ['\0'], -8363126); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked(-101, ['\0']); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked(1.1, [10,20,10,30]); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked([10,20,10,30], 0.4); -- { serverError 170 }
-SELECT arrayEnumerateDenseRanked([10,20,10,30], 1.8); -- { serverError 170 }
+SELECT arrayEnumerateUniqRanked('wat', [1,2]); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateUniqRanked(1, [1,2], 'boom'); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked(['\0'], -8363126); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked(-10, ['\0'], -8363126); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked(1, ['\0'], -8363126); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked(-101, ['\0']); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked(1.1, [10,20,10,30]); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked([10,20,10,30], 0.4); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateDenseRanked([10,20,10,30], 1.8); -- { serverError BAD_ARGUMENTS }
 SELECT arrayEnumerateUniqRanked(1, [], 1000000000); -- { serverError 36 }
 
diff --git a/tests/queries/0_stateless/03003_arrayEnumerate_crash.reference b/tests/queries/0_stateless/03003_arrayEnumerate_crash.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/03003_arrayEnumerate_crash.sql b/tests/queries/0_stateless/03003_arrayEnumerate_crash.sql
new file mode 100644
index 00000000000..21102ddbb6a
--- /dev/null
+++ b/tests/queries/0_stateless/03003_arrayEnumerate_crash.sql
@@ -0,0 +1,2 @@
+SELECT arrayEnumerateUniqRanked(arrayEnumerateUniqRanked([[1, 2, 3], [2, 2, 1], [3]]), materialize(1 AS x) OR toLowCardinality(-9223372036854775808)); -- { serverError BAD_ARGUMENTS }
+SELECT arrayEnumerateUniqRanked([[1, 2, 3], [2, 2, 1], [3]], number) FROM numbers(10); -- { serverError BAD_ARGUMENTS }

From ddf0dd7eb8784b3fb49ccf948707fe509c5f45ee Mon Sep 17 00:00:00 2001
From: avogar
Date: Mon, 4 Mar 2024 12:19:47 +0000
Subject: [PATCH 46/66] Fix crash when using input() in INSERT SELECT JOIN

---
 src/Interpreters/Context.cpp                           |  2 +-
 .../03005_input_function_in_join.reference             |  0
 .../0_stateless/03005_input_function_in_join.sql       | 11 +++++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 tests/queries/0_stateless/03005_input_function_in_join.reference
 create mode 100644 tests/queries/0_stateless/03005_input_function_in_join.sql

diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index 8304a876fb1..7f51f41ecae 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -1790,7 +1790,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const
     }
 
     uint64_t use_structure_from_insertion_table_in_table_functions = getSettingsRef().use_structure_from_insertion_table_in_table_functions;
-    if (use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable())
+    if (select_query_hint && use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable())
     {
         const auto & insert_columns = DatabaseCatalog::instance()
                                           .getTable(getInsertionTable(), shared_from_this())
diff --git a/tests/queries/0_stateless/03005_input_function_in_join.reference b/tests/queries/0_stateless/03005_input_function_in_join.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/03005_input_function_in_join.sql b/tests/queries/0_stateless/03005_input_function_in_join.sql
new file mode 100644
index 00000000000..85f275ae11f
--- /dev/null
+++ b/tests/queries/0_stateless/03005_input_function_in_join.sql
@@ -0,0 +1,11 @@
+create table test (a Int8) engine = MergeTree order by tuple();
+INSERT INTO test
+SELECT * FROM (
+    SELECT number
+    FROM system.numbers
+    LIMIT 10
+) AS x
+INNER JOIN input('a UInt64') AS y ON x.number = y.a
+Format CSV 42; -- {serverError INVALID_USAGE_OF_INPUT}
+drop table test;
+

From 63e40203d1627669576a18a83d5f0dbf139e86cd Mon Sep 17 00:00:00 2001
Shiryaev" Date: Thu, 29 Feb 2024 22:00:29 +0100 Subject: [PATCH 47/66] Use python zipfile to have x-platform idempotent lambda packages --- .../build_and_deploy_archive.sh | 54 +++++++++++-------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index 6ba0987010a..aa0ff912567 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -29,34 +29,46 @@ if [ -e "$PACKAGE.zip" ] && [ -z "$FORCE" ]; then [ -n "$REBUILD" ] || exit 0 fi +docker_cmd=( + docker run -i --net=host --rm --user="${UID}" -e HOME=/tmp --entrypoint=/bin/bash + --volume="${WORKDIR}/..:/ci" --workdir="/ci/${DIR_NAME}" "${DOCKER_IMAGE}" +) rm -rf "$PACKAGE" "$PACKAGE".zip mkdir "$PACKAGE" cp app.py "$PACKAGE" if [ -f requirements.txt ]; then VENV=lambda-venv rm -rf "$VENV" - docker run --net=host --rm --user="${UID}" -e HOME=/tmp --entrypoint=/bin/bash \ - --volume="${WORKDIR}/..:/ci" --workdir="/ci/${DIR_NAME}" "${DOCKER_IMAGE}" \ - -exc " - '$PY_EXEC' -m venv '$VENV' && - source '$VENV/bin/activate' && - pip install -r requirements.txt && - # To have consistent pyc files - find '$VENV/lib' -name '*.pyc' -delete - find '$VENV/lib' ! -type d -exec touch -t 201212121212 {} + - python -m compileall - " - cp -rT "$VENV/lib/$PY_EXEC/site-packages/" "$PACKAGE" - rm -r "$PACKAGE"/{pip,pip-*,setuptools,setuptools-*} - # zip stores metadata about timestamps - find "$PACKAGE" ! -type d -exec touch -t 201212121212 {} + + "${docker_cmd[@]}" -ex < Date: Mon, 4 Mar 2024 11:49:45 +0100 Subject: [PATCH 48/66] Remove python bytecode, make consistent file-permissions It's impossible to have persistent pyc files Each time they are built with different content, for example: > cmp -bl lambda-package*/charset_normalizer/__pycache__/constant.cpython-310.pyc 15582 6 ^F 4 ^D 15583 164 t 155 m 15584 141 a 142 b 15586 164 t 163 s 15587 151 i 332 M-Z 15588 163 s 6 ^F 15589 332 M-Z 164 t 15590 4 ^D 141 a 15591 155 m 143 c 15592 142 b 164 t 15593 143 c 151 i 17425 74 < 75 = 17428 76 > 46 & 17429 332 M-Z 372 M-z 17431 173 { 55 - 17434 75 = 174 | 17437 57 / 72 : 17440 54 , 73 ; 17441 372 M-z 332 M-Z 17443 174 | 175 } 17446 55 - 54 , 17447 372 M-z 332 M-Z 17449 46 & 173 { 17452 72 : 76 > 17455 42 " 74 < 17458 73 ; 133 [ 17461 135 ] 42 " 17464 133 [ 135 ] 17465 332 M-Z 372 M-z 17467 175 } 57 / 17503 332 M-Z 162 r 17504 5 ^E 130 X 17505 152 j 0 ^@ 17506 157 o 0 ^@ 17507 150 h 0 ^@ 17508 141 a 332 M-Z 17509 142 b 5 ^E 17510 162 r 152 j 17511 130 X 157 o 17512 0 ^@ 150 h 17513 0 ^@ 141 a 17514 0 ^@ 142 b 17536 5 ^E 2 ^B 17537 143 c 150 h 17538 160 p 172 z 17539 71 9 332 M-Z 17540 65 5 5 ^E 17541 60 0 143 c 17542 332 M-Z 160 p 17543 2 ^B 71 9 17544 150 h 65 5 17545 172 z 60 0 --- tests/ci/team_keys_lambda/build_and_deploy_archive.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh index aa0ff912567..b72bce4a677 100644 --- a/tests/ci/team_keys_lambda/build_and_deploy_archive.sh +++ b/tests/ci/team_keys_lambda/build_and_deploy_archive.sh @@ -45,10 +45,9 @@ if [ -f requirements.txt ]; then pip install -r requirements.txt && # To have consistent pyc files find '$VENV/lib' -name '*.pyc' -delete - find '$VENV/lib' ! 
-type d -exec touch -t 201212121212 {} + - python -m compileall cp -rT '$VENV/lib/$PY_EXEC/site-packages/' '$PACKAGE' rm -r '$PACKAGE'/{pip,pip-*,setuptools,setuptools-*} + chmod 0777 -R '$PACKAGE' EOF fi # Create zip archive via python zipfile to have it cross-platform From 7a851bece3f2e7ce2dfe6e819da38975fa5776eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 4 Mar 2024 14:15:15 +0100 Subject: [PATCH 49/66] Update tzdata to 2024a --- contrib/cctz | 2 +- src/Core/SettingsFields.cpp | 9 +++++++++ src/Core/SettingsFields.h | 8 +------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/contrib/cctz b/contrib/cctz index 8529bcef5cd..7918cb7afe8 160000 --- a/contrib/cctz +++ b/contrib/cctz @@ -1 +1 @@ -Subproject commit 8529bcef5cd996b7c0f4d7475286b76b5d126c4c +Subproject commit 7918cb7afe82e53428e39a045a437fdfd4f3df47 diff --git a/src/Core/SettingsFields.cpp b/src/Core/SettingsFields.cpp index e514ced6f68..001d3e09dc9 100644 --- a/src/Core/SettingsFields.cpp +++ b/src/Core/SettingsFields.cpp @@ -8,7 +8,9 @@ #include #include #include + #include +#include #include @@ -544,6 +546,13 @@ void SettingFieldTimezone::readBinary(ReadBuffer & in) *this = std::move(str); } +void SettingFieldTimezone::validateTimezone(const std::string & tz_str) +{ + cctz::time_zone validated_tz; + if (!tz_str.empty() && !cctz::load_time_zone(tz_str, &validated_tz)) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Invalid time zone: {}", tz_str); +} + String SettingFieldCustom::toString() const { return value.dump(); diff --git a/src/Core/SettingsFields.h b/src/Core/SettingsFields.h index 22c1cf8a267..452f3f149ab 100644 --- a/src/Core/SettingsFields.h +++ b/src/Core/SettingsFields.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -608,12 +607,7 @@ struct SettingFieldTimezone void readBinary(ReadBuffer & in); private: - void validateTimezone(const std::string & tz_str) - { - cctz::time_zone validated_tz; - if (!tz_str.empty() && !cctz::load_time_zone(tz_str, &validated_tz)) - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Invalid time zone: {}", tz_str); - } + void validateTimezone(const std::string & tz_str); }; /// Can keep a value of any type. Used for user-defined settings. From 9f5fe176ada41ba7bf72f19df53d79f65290a401 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 4 Mar 2024 14:41:01 +0100 Subject: [PATCH 50/66] Catch exceptions on finalize --- src/Server/InterserverIOHTTPHandler.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index c41d68bab02..d2e0ed93667 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -117,7 +117,17 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe if (auto [message, success] = checkAuthentication(request); success) { processQuery(request, response, used_output); - used_output.out->finalize(); + + try + { + used_output.out->finalize(); + } + catch (...) 
+        {
+            tryLogCurrentException(log, "Failed to finalize response write buffer");
+            return;
+        }
+
         LOG_DEBUG(log, "Done processing query");
     }
     else

From d7de634123d58e475f8023b98245ded4c9eaf66a Mon Sep 17 00:00:00 2001
From: avogar
Date: Mon, 4 Mar 2024 14:37:49 +0000
Subject: [PATCH 51/66] Update test

---
 .../0_stateless/03005_input_function_in_join.reference     | 1 +
 tests/queries/0_stateless/03005_input_function_in_join.sql | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/queries/0_stateless/03005_input_function_in_join.reference b/tests/queries/0_stateless/03005_input_function_in_join.reference
index e69de29bb2d..0cfbf08886f 100644
--- a/tests/queries/0_stateless/03005_input_function_in_join.reference
+++ b/tests/queries/0_stateless/03005_input_function_in_join.reference
@@ -0,0 +1 @@
+2
diff --git a/tests/queries/0_stateless/03005_input_function_in_join.sql b/tests/queries/0_stateless/03005_input_function_in_join.sql
index 85f275ae11f..8a6b4a48a8d 100644
--- a/tests/queries/0_stateless/03005_input_function_in_join.sql
+++ b/tests/queries/0_stateless/03005_input_function_in_join.sql
@@ -1,11 +1,14 @@
+drop table if exists test;
 create table test (a Int8) engine = MergeTree order by tuple();
 INSERT INTO test
-SELECT * FROM (
+SELECT x.number FROM (
     SELECT number
     FROM system.numbers
     LIMIT 10
 ) AS x
 INNER JOIN input('a UInt64') AS y ON x.number = y.a
-Format CSV 42; -- {serverError INVALID_USAGE_OF_INPUT}
+Format CSV 2
+;
+select * from test;
 drop table test;

From 081ed8de2aaea770efc9670b43d7da07f746158f Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Mon, 4 Mar 2024 16:18:53 +0100
Subject: [PATCH 52/66] Use 64-bit capabilities if available

This will fix the following warning in dmesg:

    capability: warning: `clickhouse-serv' uses 32-bit capabilities (legacy support in use)

P.S. I'm not even sure that the fallback code is useful, since
_LINUX_CAPABILITY_VERSION_3 was added a long time ago, in Linux 2.6.26
(released 13 July 2008)

Signed-off-by: Azat Khuzhin

---
 src/Common/hasLinuxCapability.cpp | 39 +++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/src/Common/hasLinuxCapability.cpp b/src/Common/hasLinuxCapability.cpp
index bf236eb5c56..6a4570a498c 100644
--- a/src/Common/hasLinuxCapability.cpp
+++ b/src/Common/hasLinuxCapability.cpp
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 
@@ -16,25 +18,48 @@ namespace ErrorCodes
     extern const int NETLINK_ERROR;
 }
 
-static __user_cap_data_struct getCapabilities()
+struct Capabilities
+{
+    UInt64 effective;
+    UInt64 permitted;
+    UInt64 inheritable;
+};
+
+static Capabilities getCapabilities()
 {
     /// See man getcap.
     __user_cap_header_struct request{};
-    request.version = _LINUX_CAPABILITY_VERSION_1; /// It's enough to check just single CAP_NET_ADMIN capability we are interested.
+    request.version = _LINUX_CAPABILITY_VERSION_3;
     request.pid = getpid();
 
-    __user_cap_data_struct response{};
+    Capabilities ret{};
+    __user_cap_data_struct response[2] = {};
 
     /// Avoid dependency on 'libcap'.
-    if (0 != syscall(SYS_capget, &request, &response))
-        throw ErrnoException(ErrorCodes::NETLINK_ERROR, "Cannot do 'capget' syscall");
+    if (0 == syscall(SYS_capget, &request, response))
+    {
+        ret.effective = static_cast<UInt64>(response[1].effective) << 32 | response[0].effective;
+        ret.permitted = static_cast<UInt64>(response[1].permitted) << 32 | response[0].permitted;
+        ret.inheritable = static_cast<UInt64>(response[1].inheritable) << 32 | response[0].inheritable;
+        return ret;
+    }
 
-    return response;
+    /// Does not support V3, fall back to V1.
+    /// It's enough to check just the single CAP_NET_ADMIN capability we are interested in.
+    if (errno == EINVAL && 0 == syscall(SYS_capget, &request, response))
+    {
+        ret.effective = response[0].effective;
+        ret.permitted = response[0].permitted;
+        ret.inheritable = response[0].inheritable;
+        return ret;
+    }
+
+    throw ErrnoException(ErrorCodes::NETLINK_ERROR, "Cannot do 'capget' syscall");
 }
 
 bool hasLinuxCapability(int cap)
 {
-    static __user_cap_data_struct capabilities = getCapabilities();
+    static Capabilities capabilities = getCapabilities();
     return (1 << cap) & capabilities.effective;
 }

From fb8241c6519029882371b14d8ba0b1a61108c18d Mon Sep 17 00:00:00 2001
From: Antonio Andelic
Date: Mon, 4 Mar 2024 16:36:46 +0100
Subject: [PATCH 53/66] Better

---
 src/Server/InterserverIOHTTPHandler.cpp | 33 +++++++++++++------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp
index d2e0ed93667..28045380cd7 100644
--- a/src/Server/InterserverIOHTTPHandler.cpp
+++ b/src/Server/InterserverIOHTTPHandler.cpp
@@ -91,24 +91,35 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe
     used_output.out = std::make_shared<WriteBufferFromHTTPServerResponse>(
         response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout, write_event);
 
+    auto finalize_output = [&]
+    {
+        try
+        {
+            used_output.out->finalize();
+        }
+        catch (...)
- { - tryLogCurrentException(log, "Failed to finalize response write buffer"); - return; - } - + finalize_output(); LOG_DEBUG(log, "Done processing query"); } else From 0149b8893ad99d03bbc1ea6dd512c9b08648cfbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 4 Mar 2024 17:42:06 +0100 Subject: [PATCH 54/66] Include multiline logs in fuzzer fatal.log report --- docker/test/fuzzer/run-fuzzer.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index b4376fe2409..7a0d2939cd3 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -386,7 +386,7 @@ if [ -f core.zst ]; then CORE_LINK='core.zst' fi -rg --text -F '' server.log > fatal.log ||: +sed -n '//,/^$/p' s.log | rg "(^[^202])|" server.log > fatal.log ||: FATAL_LINK='' if [ -s fatal.log ]; then FATAL_LINK='fatal.log' From a7430004b3cbc9f9f5a7b712dd41e33e5b256126 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 04:33:15 +0100 Subject: [PATCH 55/66] Fix real time query profiler on AArch64 --- base/glibc-compatibility/musl/aarch64/syscall.s | 2 ++ programs/server/Server.cpp | 5 ----- src/Interpreters/TraceCollector.cpp | 1 - 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/base/glibc-compatibility/musl/aarch64/syscall.s b/base/glibc-compatibility/musl/aarch64/syscall.s index 845986bf787..aadaea04ef5 100644 --- a/base/glibc-compatibility/musl/aarch64/syscall.s +++ b/base/glibc-compatibility/musl/aarch64/syscall.s @@ -2,6 +2,7 @@ .hidden __syscall .type __syscall,%function __syscall: +.cfi_startproc uxtw x8,w0 mov x0,x1 mov x1,x2 @@ -12,3 +13,4 @@ __syscall: mov x6,x7 svc 0 ret +.cfi_endproc diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 93562d6df90..c45291ba52c 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1881,7 +1881,6 @@ try { total_memory_tracker.setSampleMaxAllocationSize(server_settings.total_memory_profiler_sample_max_allocation_size); } - } #endif @@ -1896,10 +1895,6 @@ try " when two different stack unwinding methods will interfere with each other."); #endif -#if !defined(__x86_64__) - LOG_INFO(log, "Query Profiler and TraceCollector is only tested on x86_64. 
It also known to not work under qemu-user."); -#endif - if (!hasPHDRCache()) LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created" " (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe)."); diff --git a/src/Interpreters/TraceCollector.cpp b/src/Interpreters/TraceCollector.cpp index 1fe11be6090..8e9c397b7a1 100644 --- a/src/Interpreters/TraceCollector.cpp +++ b/src/Interpreters/TraceCollector.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include From 6e579e6bfaa4e486ff3cdd2285992ff95163c212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 4 Mar 2024 18:15:53 +0100 Subject: [PATCH 56/66] Use awk since it's much simpler --- docker/test/fuzzer/run-fuzzer.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 7a0d2939cd3..9358e88e1e8 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -386,7 +386,8 @@ if [ -f core.zst ]; then CORE_LINK='core.zst' fi -sed -n '//,/^$/p' s.log | rg "(^[^202])|" server.log > fatal.log ||: +# Keep all the lines in the paragraphs containing that either contain or don't start with 20... (year) +sed -n '//,/^$/p' s.log | awk '// || !/^20/' server.log > fatal.log ||: FATAL_LINK='' if [ -s fatal.log ]; then FATAL_LINK='fatal.log' From ff2882c7072ea2f08df67e77236f1890c6fcdc8d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Mar 2024 18:16:38 +0100 Subject: [PATCH 57/66] Fix query profiler on AArch64 --- tests/queries/0_stateless/00974_query_profiler.sql | 2 +- tests/queries/0_stateless/01092_memory_profiler.sql | 2 +- tests/queries/0_stateless/01526_max_untracked_memory.sh | 3 +-- tests/queries/0_stateless/01569_query_profiler_big_query_id.sh | 3 +-- tests/queries/0_stateless/02161_addressToLineWithInlines.sql | 2 +- tests/queries/0_stateless/02252_jit_profile_events.sql | 2 +- .../02818_memory_profiler_sample_min_max_allocation_size.sh | 3 +-- 7 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/00974_query_profiler.sql b/tests/queries/0_stateless/00974_query_profiler.sql index b697bd56800..24e4241b813 100644 --- a/tests/queries/0_stateless/00974_query_profiler.sql +++ b/tests/queries/0_stateless/00974_query_profiler.sql @@ -1,4 +1,4 @@ --- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-fasttest, no-cpu-aarch64 +-- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-fasttest -- Tag no-fasttest: Not sure why fail even in sequential mode. Disabled for now to make some progress. 
 SET allow_introspection_functions = 1;
diff --git a/tests/queries/0_stateless/01092_memory_profiler.sql b/tests/queries/0_stateless/01092_memory_profiler.sql
index b69d3faf94e..3a04de650ce 100644
--- a/tests/queries/0_stateless/01092_memory_profiler.sql
+++ b/tests/queries/0_stateless/01092_memory_profiler.sql
@@ -1,4 +1,4 @@
--- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-parallel, no-fasttest, no-cpu-aarch64
+-- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-parallel, no-fasttest
 
 SET allow_introspection_functions = 1;
diff --git a/tests/queries/0_stateless/01526_max_untracked_memory.sh b/tests/queries/0_stateless/01526_max_untracked_memory.sh
index 45fdb314fb2..b2bad637422 100755
--- a/tests/queries/0_stateless/01526_max_untracked_memory.sh
+++ b/tests/queries/0_stateless/01526_max_untracked_memory.sh
@@ -1,6 +1,5 @@
 #!/usr/bin/env bash
-# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-cpu-aarch64
-# requires TraceCollector, does not available under sanitizers and aarch64
+# Tags: no-tsan, no-asan, no-ubsan, no-msan
 
 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
diff --git a/tests/queries/0_stateless/01569_query_profiler_big_query_id.sh b/tests/queries/0_stateless/01569_query_profiler_big_query_id.sh
index e54783e9655..941ab216d0b 100755
--- a/tests/queries/0_stateless/01569_query_profiler_big_query_id.sh
+++ b/tests/queries/0_stateless/01569_query_profiler_big_query_id.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-cpu-aarch64
+# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug
 
 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
@@ -9,4 +9,3 @@ query_id="aggregating_merge_tree_simple_aggregate_function_string_query100_profi
 ${CLICKHOUSE_CLIENT} --query="select sleep(1)" --query_id="$query_id" --query_profiler_real_time_period_ns=10000000
 ${CLICKHOUSE_CLIENT} --query="system flush logs"
 ${CLICKHOUSE_CLIENT} --query="select count(*) > 1 from system.trace_log where query_id = '$query_id'"
-
diff --git a/tests/queries/0_stateless/02161_addressToLineWithInlines.sql b/tests/queries/0_stateless/02161_addressToLineWithInlines.sql
index b6b497b4b55..78b414378f1 100644
--- a/tests/queries/0_stateless/02161_addressToLineWithInlines.sql
+++ b/tests/queries/0_stateless/02161_addressToLineWithInlines.sql
@@ -1,4 +1,4 @@
--- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-cpu-aarch64
+-- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug
 
 SET allow_introspection_functions = 0;
 SELECT addressToLineWithInlines(1); -- { serverError 446 }
diff --git a/tests/queries/0_stateless/02252_jit_profile_events.sql b/tests/queries/0_stateless/02252_jit_profile_events.sql
index fb7f806c46b..487f43737e8 100644
--- a/tests/queries/0_stateless/02252_jit_profile_events.sql
+++ b/tests/queries/0_stateless/02252_jit_profile_events.sql
@@ -1,4 +1,4 @@
--- Tags: no-fasttest, no-parallel, no-cpu-aarch64, no-msan
+-- Tags: no-fasttest, no-parallel, no-msan
 
 SET compile_expressions = 1;
 SET min_count_to_compile_expression = 0;
diff --git a/tests/queries/0_stateless/02818_memory_profiler_sample_min_max_allocation_size.sh b/tests/queries/0_stateless/02818_memory_profiler_sample_min_max_allocation_size.sh
index b1fbea26da7..9234c428147 100755
--- a/tests/queries/0_stateless/02818_memory_profiler_sample_min_max_allocation_size.sh
+++ b/tests/queries/0_stateless/02818_memory_profiler_sample_min_max_allocation_size.sh
@@ -1,6 +1,5 @@
 #!/usr/bin/env bash
-# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-cpu-aarch64, no-random-settings
-# requires TraceCollector, does not available under sanitizers and aarch64
+# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-random-settings
 
 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh

From 70abdf7a414ee57d59df51f6cf5ec435e2830f9e Mon Sep 17 00:00:00 2001
From: avogar
Date: Mon, 4 Mar 2024 17:32:22 +0000
Subject: [PATCH 58/66] Small improvements in JSON schema inference

---
 docs/en/interfaces/schema-inference.md        | 64 +++++++++++++
 src/Core/Settings.h                           |  3 +-
 src/Core/SettingsChangesHistory.h             |  3 +
 src/Formats/EscapingRuleUtils.cpp             |  9 +-
 src/Formats/FormatFactory.cpp                 |  1 +
 src/Formats/FormatSettings.h                  |  1 +
 src/Formats/SchemaInferenceUtils.cpp          | 91 +++++++++++--------
 ...02982_dont_infer_exponent_floats.reference |  1 +
 .../02982_dont_infer_exponent_floats.sql      |  3 +
 ...erence_ambiguous_paths_as_string.reference |  3 +
 ...es_inference_ambiguous_paths_as_string.sql |  4 +
 11 files changed, 142 insertions(+), 41 deletions(-)
 create mode 100644 tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference
 create mode 100644 tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql

diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md
index 39ae69eaef4..f2e9136d1db 100644
--- a/docs/en/interfaces/schema-inference.md
+++ b/docs/en/interfaces/schema-inference.md
@@ -549,6 +549,48 @@ Result:
 └───────┴─────────────────────────────────────────────────────────────────────────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
+##### input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects
+
+Enabling this setting allows to use String type for ambiguous paths during named tuples inference from JSON objects (when `input_format_json_try_infer_named_tuples_from_objects` is enabled) instead of an exception.
+It allows to read JSON objects as named Tuples even if there are ambiguous paths.
+
+Disabled by default.
+
+**Examples**
+
+With disabled setting:
+```sql
+SET input_format_json_try_infer_named_tuples_from_objects = 1;
+SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 0;
+DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+```
+Result:
+
+```text
+Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error:
+Code: 117. DB::Exception: JSON objects have ambiguous paths: 'a' (with type Int64) and 'a.b'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1).
+You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE)
+```
+
+With enabled setting:
+```sql
+SET input_format_json_try_infer_named_tuples_from_objects = 1;
+SET input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = 1;
+DESC format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+SELECT * FROM format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : "Hello"}}}');
+```
+
+Result:
+```text
+┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ obj  │ Tuple(a Nullable(String))     │              │                    │         │                  │                │
+└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─obj─────────────────┐
+│ ('42')              │
+│ ('{"b" : "Hello"}') │
+└─────────────────────┘
+```
+
 ##### input_format_json_read_objects_as_strings
 
 Enabling this setting allows reading nested JSON objects as strings.
@@ -1554,6 +1596,28 @@ DESC format(JSONEachRow, $$
 └──────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
+#### input_format_try_infer_exponent_floats
+
+If enabled, ClickHouse will try to infer floats in exponential form for text formats (except JSON where numbers in exponential form are always inferred).
+
+Disabled by default.
+
+**Example**
+
+```sql
+SET input_format_try_infer_exponent_floats = 1;
+DESC format(CSV,
+$$1.1E10
+2.3e-12
+42E00
+$$)
+```
+```response
+┌─name─┬─type──────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ c1   │ Nullable(Float64) │              │                    │         │                  │                │
+└──────┴───────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
 ## Self describing formats {#self-describing-formats}
 
 Self-describing formats contain information about the structure of the data in the data itself,
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index ae6ea165cc9..3f71223c910 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -1018,6 +1018,7 @@ class IColumn;
     M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
     M(Bool, input_format_json_read_arrays_as_strings, true, "Allow to parse JSON arrays as strings in JSON input formats", 0) \
     M(Bool, input_format_json_try_infer_named_tuples_from_objects, true, "Try to infer named tuples from JSON objects in JSON input formats", 0) \
+    M(Bool, input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects, false, "Use String type instead of an exception in case of ambiguous paths in JSON objects during named tuples inference", 0) \
     M(Bool, input_format_json_infer_incomplete_types_as_strings, true, "Use type String for keys that contains only Nulls or empty objects/arrays during schema inference in JSON input formats", 0) \
     M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
     M(Bool, input_format_json_ignore_unknown_keys_in_named_tuple, true, "Ignore unknown keys in json object for named tuples", 0) \
@@ -1025,7 +1026,7 @@ class IColumn;
     M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
     M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
     M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
-    M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats", 0) \
+    M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
     M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
     M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
     M(Bool, output_format_protobuf_nullables_with_google_wrappers, false, "When serializing Nullable columns with Google wrappers, serialize default values as empty wrappers. If turned off, default and null values are not serialized", 0) \
diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
index face1def4b4..f473d677ecd 100644
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@@ -85,6 +85,9 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
+    {"24.3", {
+        {"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"},
+    }},
     {"24.2", {
        {"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"},
        {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"},
diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp
index 16f8a341e03..577988871f3 100644
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@@ -450,8 +450,10 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
             break;
         case FormatSettings::EscapingRule::JSON:
             result += fmt::format(
-                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, read_numbers_as_strings={}, "
-                "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}",
+                ", try_infer_numbers_from_strings={}, read_bools_as_numbers={}, read_bools_as_strings={}, read_objects_as_strings={}, "
+                "read_numbers_as_strings={}, "
+                "read_arrays_as_strings={}, try_infer_objects_as_tuples={}, infer_incomplete_types_as_strings={}, try_infer_objects={}, "
+                "use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects={}",
                 settings.json.try_infer_numbers_from_strings,
                 settings.json.read_bools_as_numbers,
                 settings.json.read_bools_as_strings,
@@ -460,7 +462,8 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
                 settings.json.read_arrays_as_strings,
                 settings.json.try_infer_objects_as_tuples,
                 settings.json.infer_incomplete_types_as_strings,
-                settings.json.allow_object_type);
+                settings.json.allow_object_type,
+                settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects);
             break;
         default:
             break;
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
index a4a08d762b9..ccead6688a7 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -105,6 +105,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects; format_settings.json.skip_null_value_in_named_tuples = settings.output_format_json_skip_null_value_in_named_tuples; format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects; + format_settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = settings.input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects; format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple; format_settings.json.ignore_unknown_keys_in_named_tuple = settings.input_format_json_ignore_unknown_keys_in_named_tuple; format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 01c3632c730..42b21c77cef 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -202,6 +202,7 @@ struct FormatSettings bool quote_decimals = false; bool escape_forward_slashes = true; bool read_named_tuples_as_objects = false; + bool use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects = false; bool write_named_tuples_as_objects = false; bool skip_null_value_in_named_tuples = false; bool defaults_for_missing_elements_in_named_tuple = false; diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 06b52e7a7a2..998f97fae0d 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -136,7 +136,7 @@ namespace bool empty() const { return paths.empty(); } - DataTypePtr finalize() const + DataTypePtr finalize(bool use_string_type_for_ambiguous_paths = false) const { if (paths.empty()) throw Exception(ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA, "Cannot infer named Tuple from JSON object because object is empty"); @@ -167,7 +167,7 @@ namespace current_node->leaf_type = type; } - return root_node.getType(); + return root_node.getType(use_string_type_for_ambiguous_paths); } private: @@ -180,7 +180,7 @@ namespace /// Store path to this node for better exception message in case of ambiguous paths. String path; - DataTypePtr getType() const + DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const { /// Check if we have ambiguous paths. /// For example: @@ -191,7 +191,16 @@ namespace /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing), /// but it's a valid case and we should ignore path 'a.b'. if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty()) - throw Exception(ErrorCodes::INCORRECT_DATA, "JSON objects have ambiguous paths: '{}' with type {} and '{}'", path, leaf_type->getName(), nodes.begin()->second.path); + { + if (use_string_type_for_ambiguous_paths) + return std::make_shared(); + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON objects have ambiguous paths: '{}' (with type {}) and '{}'. 
You can enable setting " + "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type " + "for path '{}'", + path, leaf_type->getName(), nodes.begin()->second.path, path); + } if (nodes.empty()) return leaf_type; @@ -203,7 +212,7 @@ namespace for (const auto & [name, node] : nodes) { node_names.push_back(name); - node_types.push_back(node.getType()); + node_types.push_back(node.getType(use_string_type_for_ambiguous_paths)); } return std::make_shared(std::move(node_types), std::move(node_names)); @@ -866,13 +875,15 @@ namespace return std::make_shared(nested_types); } + template bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) { - if (settings.try_infer_exponent_floats) + if (is_json || settings.try_infer_exponent_floats) return tryReadFloatText(value, buf); return tryReadFloatTextNoExponent(value, buf); } + template DataTypePtr tryInferNumber(ReadBuffer & buf, const FormatSettings & settings) { if (buf.eof()) @@ -911,7 +922,7 @@ namespace buf.position() = number_start; } - if (tryReadFloat(tmp_float, buf, settings)) + if (tryReadFloat(tmp_float, buf, settings)) { if (read_int && buf.position() == int_end) return std::make_shared(); @@ -945,7 +956,7 @@ namespace peekable_buf.rollbackToCheckpoint(true); } - if (tryReadFloat(tmp_float, peekable_buf, settings)) + if (tryReadFloat(tmp_float, peekable_buf, settings)) { /// Float parsing reads no fewer bytes than integer parsing, /// so position of the buffer is either the same, or further. @@ -957,7 +968,7 @@ namespace return std::make_shared(); } } - else if (tryReadFloat(tmp_float, buf, settings)) + else if (tryReadFloat(tmp_float, buf, settings)) { return std::make_shared(); } @@ -966,6 +977,36 @@ namespace return nullptr; } + template + DataTypePtr tryInferNumberFromStringImpl(std::string_view field, const FormatSettings & settings) + { + ReadBufferFromString buf(field); + + if (settings.try_infer_integers) + { + Int64 tmp_int; + if (tryReadIntText(tmp_int, buf) && buf.eof()) + return std::make_shared(); + + /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. + buf.position() = buf.buffer().begin(); + + /// In case of Int64 overflow, try to infer UInt64 + UInt64 tmp_uint; + if (tryReadIntText(tmp_uint, buf) && buf.eof()) + return std::make_shared(); + } + + /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. 
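+ /// Direct assignment to buf.position() is safe only because ReadBufferFromString keeps the whole
+ /// input string in one contiguous internal buffer; a generic ReadBuffer would need a
+ /// PeekableReadBuffer checkpoint to roll back, as tryInferNumber does above.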
+ buf.position() = buf.buffer().begin(); + + Float64 tmp; + if (tryReadFloat(tmp, buf, settings) && buf.eof()) + return std::make_shared(); + + return nullptr; + } + template DataTypePtr tryInferString(ReadBuffer & buf, const FormatSettings & settings, JSONInferenceInfo * json_info) { @@ -995,7 +1036,7 @@ namespace { if (settings.json.try_infer_numbers_from_strings) { - if (auto number_type = tryInferNumberFromString(field, settings)) + if (auto number_type = tryInferNumberFromStringImpl(field, settings)) { json_info->numbers_parsed_from_json_strings.insert(number_type.get()); return number_type; @@ -1238,7 +1279,7 @@ namespace } /// Number - return tryInferNumber(buf, settings); + return tryInferNumber(buf, settings); } } @@ -1294,7 +1335,7 @@ void transformFinalInferredJSONTypeIfNeededImpl(DataTypePtr & data_type, const F return; } - data_type = json_paths->finalize(); + data_type = json_paths->finalize(settings.json.use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects); transformFinalInferredJSONTypeIfNeededImpl(data_type, settings, json_info, remain_nothing_types); return; } @@ -1377,31 +1418,7 @@ void transformFinalInferredJSONTypeIfNeeded(DataTypePtr & data_type, const Forma DataTypePtr tryInferNumberFromString(std::string_view field, const FormatSettings & settings) { - ReadBufferFromString buf(field); - - if (settings.try_infer_integers) - { - Int64 tmp_int; - if (tryReadIntText(tmp_int, buf) && buf.eof()) - return std::make_shared(); - - /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. - buf.position() = buf.buffer().begin(); - - /// In case of Int64 overflow, try to infer UInt64 - UInt64 tmp_uint; - if (tryReadIntText(tmp_uint, buf) && buf.eof()) - return std::make_shared(); - } - - /// We can safely get back to the start of buffer, because we read from a string and we didn't reach eof. 
- buf.position() = buf.buffer().begin(); - - Float64 tmp; - if (tryReadFloat(tmp, buf, settings) && buf.eof()) - return std::make_shared(); - - return nullptr; + return tryInferNumberFromStringImpl(field, settings); } DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const FormatSettings & settings) diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference index b6d1ff865e5..47e9b86237a 100644 --- a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.reference @@ -1,2 +1,3 @@ c1 Nullable(String) c1 Nullable(Float64) +x Nullable(Float64) diff --git a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql index 2a281e898f1..4f78855f5ce 100644 --- a/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql +++ b/tests/queries/0_stateless/02982_dont_infer_exponent_floats.sql @@ -1,2 +1,5 @@ DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 0; DESC format(CSV, '1E20\n1.1E20') settings input_format_try_infer_exponent_floats = 1; +-- This setting should not take affect on JSON formats +DESC format(JSONEachRow, '{"x" : 1.1e20}') settings input_format_try_infer_exponent_floats = 0; + diff --git a/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference new file mode 100644 index 00000000000..0318b136ade --- /dev/null +++ b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.reference @@ -0,0 +1,3 @@ +obj Tuple(\n a Nullable(String)) +('42') +('{"b" : 42}') diff --git a/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql new file mode 100644 index 00000000000..4b986c94868 --- /dev/null +++ b/tests/queries/0_stateless/03004_json_named_tuples_inference_ambiguous_paths_as_string.sql @@ -0,0 +1,4 @@ +set input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects=1; +desc format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}'); +select * from format(JSONEachRow, '{"obj" : {"a" : 42}}, {"obj" : {"a" : {"b" : 42}}}'); + From 9a0546168094d38692725f89677077e32bd144b5 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 4 Mar 2024 17:49:33 +0000 Subject: [PATCH 59/66] Better exception message --- docs/en/interfaces/schema-inference.md | 2 +- src/Formats/SchemaInferenceUtils.cpp | 45 ++++++++++++++------------ 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index f2e9136d1db..05fae994cbe 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -568,7 +568,7 @@ Result: ```text Code: 636. DB::Exception: The table structure cannot be extracted from a JSONEachRow format file. Error: -Code: 117. DB::Exception: JSON objects have ambiguous paths: 'a' (with type Int64) and 'a.b'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1). +Code: 117. 
DB::Exception: JSON objects have ambiguous data: in some objects path 'a' has type 'Int64' and in some - 'Tuple(b String)'. You can enable setting input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type for path 'a'. (INCORRECT_DATA) (version 24.3.1.1). You can specify the structure manually. (CANNOT_EXTRACT_TABLE_STRUCTURE) ``` diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 998f97fae0d..cb574551d26 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -182,26 +182,6 @@ namespace DataTypePtr getType(bool use_string_type_for_ambiguous_paths) const { - /// Check if we have ambiguous paths. - /// For example: - /// 'a.b.c' : Int32 and 'a.b' : String - /// Also check if leaf type is Nothing, because the next situation is possible: - /// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing) - /// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32 - /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing), - /// but it's a valid case and we should ignore path 'a.b'. - if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty()) - { - if (use_string_type_for_ambiguous_paths) - return std::make_shared(); - throw Exception( - ErrorCodes::INCORRECT_DATA, - "JSON objects have ambiguous paths: '{}' (with type {}) and '{}'. You can enable setting " - "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type " - "for path '{}'", - path, leaf_type->getName(), nodes.begin()->second.path, path); - } - if (nodes.empty()) return leaf_type; @@ -215,7 +195,30 @@ namespace node_types.push_back(node.getType(use_string_type_for_ambiguous_paths)); } - return std::make_shared(std::move(node_types), std::move(node_names)); + auto tuple_type = std::make_shared(std::move(node_types), std::move(node_names)); + + /// Check if we have ambiguous paths. + /// For example: + /// 'a.b.c' : Int32 and 'a.b' : String + /// Also check if leaf type is Nothing, because the next situation is possible: + /// {"a" : {"b" : null}} -> 'a.b' : Nullable(Nothing) + /// {"a" : {"b" : {"c" : 42}}} -> 'a.b.c' : Int32 + /// And after merge we will have ambiguous paths 'a.b.c' : Int32 and 'a.b' : Nullable(Nothing), + /// but it's a valid case and we should ignore path 'a.b'. + if (leaf_type && !isNothing(removeNullable(leaf_type)) && !nodes.empty()) + { + if (use_string_type_for_ambiguous_paths) + return std::make_shared(); + + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON objects have ambiguous data: in some objects path '{}' has type '{}' and in some - '{}'. You can enable setting " + "input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects to use String type " + "for path '{}'", + path, leaf_type->getName(), tuple_type->getName(), path); + } + + return tuple_type; } }; From bd2b0b4338923328198ed6e27f872324646c5a0c Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 4 Mar 2024 20:01:26 +0100 Subject: [PATCH 60/66] Export only str and bool for build configs --- tests/ci/ci_config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index dd175177858..b9c332e8e18 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -264,6 +264,8 @@ class BuildConfig: def process(field_name: str, field: Union[bool, str]) -> str: if isinstance(field, bool): field = str(field).lower() + elif not isinstance(field, str): + field = "" if export: return f"export BUILD_{field_name.upper()}={repr(field)}" return f"BUILD_{field_name.upper()}={field}" From a6caace5ecb299184b9f07e3c3e4dc38a210b924 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 4 Mar 2024 20:03:45 +0100 Subject: [PATCH 61/66] Add a new runner type --- tests/ci/lambda_shared_package/lambda_shared/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/lambda_shared_package/lambda_shared/__init__.py b/tests/ci/lambda_shared_package/lambda_shared/__init__.py index 9e6c5dde298..043a0310d11 100644 --- a/tests/ci/lambda_shared_package/lambda_shared/__init__.py +++ b/tests/ci/lambda_shared_package/lambda_shared/__init__.py @@ -20,11 +20,12 @@ RUNNER_TYPE_LABELS = [ "style-checker", "style-checker-aarch64", # private runners - "private-style-checker", "private-builder", + "private-clickpipes", "private-func-tester", "private-fuzzer-unit-tester", "private-stress-tester", + "private-style-checker", ] From 24505ba9b34c8857c2617974323a4f3302ce6d7c Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 4 Mar 2024 19:36:38 +0000 Subject: [PATCH 62/66] Move userspace page cache settings to the correct section of SettingsChangeHistory.h --- src/Core/SettingsChangesHistory.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index a7f96679bbe..2952e27bb39 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -85,8 +85,11 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { - {"24.2", { - {"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, + {"24.3", {{"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"}, + {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"}, + {"page_cache_inject_eviction", false, false, "Added userspace page cache"}, + }}, + {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, {"output_format_values_escape_quote_with_quote", false, false, "If true escape ' with '', otherwise quoted with \\'"}, {"output_format_pretty_single_large_number_tip_threshold", 0, 1'000'000, "Print a readable number tip on the right side of the table if the block consists of a single number which exceeds this value (except 0)"}, @@ -114,9 +117,6 @@ static std::map sett {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, {"mysql_map_string_to_text_in_show_columns", false, 
true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, - {"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"}, - {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"}, - {"page_cache_inject_eviction", false, false, "Added userspace page cache"}, }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, From a6f5323c38d8420765332e3e778399de3fc69acf Mon Sep 17 00:00:00 2001 From: Max Kainov Date: Tue, 5 Mar 2024 10:09:17 +0000 Subject: [PATCH 63/66] CI: Fix pending status for build report in backports #do_not_test --- tests/ci/build_report_check.py | 2 +- tests/ci/ci_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 94e429ad77b..48640f15ac0 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -51,7 +51,7 @@ def main(): builds_for_check = CI_CONFIG.get_builds_for_report( build_check_name, release=pr_info.is_release(), - backport=pr_info.head_ref.startswith("backport"), + backport=pr_info.head_ref.startswith("backport/"), ) required_builds = len(builds_for_check) missing_builds = 0 diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index b9c332e8e18..b9ee5670066 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -586,7 +586,7 @@ class CIConfig: Build.PACKAGE_TSAN, Build.PACKAGE_DEBUG, ] - if release and report_name == JobNames.BUILD_CHECK_SPECIAL: + if (release or backport) and report_name == JobNames.BUILD_CHECK_SPECIAL: return [ Build.BINARY_DARWIN, Build.BINARY_DARWIN_AARCH64, From edf2dc91687786742687624af40d1ac93c92f16b Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 5 Mar 2024 10:20:57 +0000 Subject: [PATCH 64/66] Update version_date.tsv and changelogs after v23.8.10.43-lts --- docs/changelogs/v23.8.10.43-lts.md | 39 ++++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 1 + 2 files changed, 40 insertions(+) create mode 100644 docs/changelogs/v23.8.10.43-lts.md diff --git a/docs/changelogs/v23.8.10.43-lts.md b/docs/changelogs/v23.8.10.43-lts.md new file mode 100644 index 00000000000..0093467d129 --- /dev/null +++ b/docs/changelogs/v23.8.10.43-lts.md @@ -0,0 +1,39 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v23.8.10.43-lts (a278225bba9) FIXME as compared to v23.8.9.54-lts (192a1d231fa) + +#### Improvement +* Backported in [#58819](https://github.com/ClickHouse/ClickHouse/issues/58819): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW command in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#60286](https://github.com/ClickHouse/ClickHouse/issues/60286): Copy S3 file GCP fallback to buffer copy in case GCP returned `Internal Error` with `GATEWAY_TIMEOUT` HTTP error code. 
[#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)).
+
+#### Build/Testing/Packaging Improvement
+* Backported in [#59879](https://github.com/ClickHouse/ClickHouse/issues/59879): If you want to run initdb scripts every time the ClickHouse container starts, you should initialize the environment variable CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+
+* Background merges correctly use temporary data storage in the cache [#57275](https://github.com/ClickHouse/ClickHouse/pull/57275) ([vdimir](https://github.com/vdimir)).
+* MergeTree mutations reuse source part index granularity [#57352](https://github.com/ClickHouse/ClickHouse/pull/57352) ([Maksim Kita](https://github.com/kitaisreal)).
+* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix JSONExtract function for LowCardinality(Nullable) columns [#58808](https://github.com/ClickHouse/ClickHouse/pull/58808) ([vdimir](https://github.com/vdimir)).
+* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)).
+* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix error "Read beyond last offset" for AsynchronousBoundedReadBuffer [#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)).
+* Fix query start time on non initial queries [#59662](https://github.com/ClickHouse/ClickHouse/pull/59662) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)).
+* rabbitmq: fix having neither acked nor nacked messages [#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)).
+* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Fix rare race in external sort/aggregation with temporary data in cache [#58013](https://github.com/ClickHouse/ClickHouse/pull/58013) ([Anton Popov](https://github.com/CurtizJ)).
+* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Fix 02720_row_policy_column_with_dots [#59453](https://github.com/ClickHouse/ClickHouse/pull/59453) ([Duc Canh Le](https://github.com/canhld94)).
+* Pin python dependencies in stateless tests [#59663](https://github.com/ClickHouse/ClickHouse/pull/59663) ([Raúl Marín](https://github.com/Algunenano)).
+* Make ZooKeeper actually sequentialy consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)).
+* Remove broken test while we fix it [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)).
+
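The version_date.tsv hunk below inserts the new release row so that the file stays ordered by version, newest first. A minimal sketch of that ordering invariant — `insert_version` is a hypothetical helper for illustration, not the actual release tooling in utils/list-versions:

```python
# Hypothetical sketch: keep version_date.tsv rows sorted by version, descending.
from datetime import date


def insert_version(lines: list[str], tag: str, released: date) -> list[str]:
    def version_key(line: str) -> list[int]:
        # "v23.8.10.43-lts\t2024-03-05" -> [23, 8, 10, 43]
        version = line.split()[0].lstrip("v").split("-")[0]
        return [int(part) for part in version.split(".")]

    entry = f"{tag}\t{released.isoformat()}"
    return sorted([*lines, entry], key=version_key, reverse=True)


# Example: the row added in this patch lands between v23.9.1.* and v23.8.9.*.
rows = ["v23.9.1.1854-stable\t2023-09-29", "v23.8.9.54-lts\t2024-01-05"]
print(insert_version(rows, "v23.8.10.43-lts", date(2024, 3, 5)))
```
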
diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index 572ceddf590..53bf705637d 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -25,6 +25,7 @@ v23.9.4.11-stable 2023-11-08
 v23.9.3.12-stable 2023-10-31
 v23.9.2.56-stable 2023-10-19
 v23.9.1.1854-stable 2023-09-29
+v23.8.10.43-lts 2024-03-05
 v23.8.9.54-lts 2024-01-05
 v23.8.8.20-lts 2023-11-25
 v23.8.7.24-lts 2023-11-17

From 738c481cce72716fb8e6cd2295ee22040a2a14a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?=
Date: Tue, 5 Mar 2024 10:36:42 +0000
Subject: [PATCH 65/66] Fix fuzzer report

---
 docker/test/fuzzer/run-fuzzer.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh
index 9358e88e1e8..ccf450c94f2 100755
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@@ -387,7 +387,7 @@ if [ -f core.zst ]; then
 fi

 # Keep all the lines in the paragraphs containing <Fatal> that either contain <Fatal> or don't start with 20... (year)
-sed -n '/<Fatal>/,/^$/p' s.log | awk '/<Fatal>/ || !/^20/' server.log > fatal.log ||:
+sed -n '/<Fatal>/,/^$/p' server.log | awk '/<Fatal>/ || !/^20/' > fatal.log ||:
 FATAL_LINK=''
 if [ -s fatal.log ]; then
     FATAL_LINK='fatal.log'

From 23ec4d4c6e386b67953cee01be5c740f47e39c78 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Tue, 5 Mar 2024 11:53:19 +0000
Subject: [PATCH 66/66] Update version_date.tsv and changelogs after v23.3.20.27-lts

---
 docs/changelogs/v23.3.20.27-lts.md | 29 ++++++++++++++++++++++++++++
 utils/list-versions/version_date.tsv | 1 +
 2 files changed, 30 insertions(+)
 create mode 100644 docs/changelogs/v23.3.20.27-lts.md

diff --git a/docs/changelogs/v23.3.20.27-lts.md b/docs/changelogs/v23.3.20.27-lts.md
new file mode 100644
index 00000000000..9f49e47f0bc
--- /dev/null
+++ b/docs/changelogs/v23.3.20.27-lts.md
@@ -0,0 +1,29 @@
+---
+sidebar_position: 1
+sidebar_label: 2024
+---
+
+# 2024 Changelog
+
+### ClickHouse release v23.3.20.27-lts (cc974ba4f81) FIXME as compared to v23.3.19.32-lts (c4d4ca8ec02)
+
+#### Improvement
+* Backported in [#58818](https://github.com/ClickHouse/ClickHouse/issues/58818): Add `SYSTEM JEMALLOC PURGE` for purging unused jemalloc pages, `SYSTEM JEMALLOC [ ENABLE | DISABLE | FLUSH ] PROFILE` for controlling jemalloc profile if the profiler is enabled. Add jemalloc-related 4LW command in Keeper: `jmst` for dumping jemalloc stats, `jmfp`, `jmep`, `jmdp` for controlling jemalloc profile if the profiler is enabled. [#58665](https://github.com/ClickHouse/ClickHouse/pull/58665) ([Antonio Andelic](https://github.com/antonio2368)).
+
+#### Build/Testing/Packaging Improvement
+* Backported in [#59877](https://github.com/ClickHouse/ClickHouse/issues/59877): If you want to run initdb scripts every time the ClickHouse container starts, you should initialize the environment variable CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS. [#59808](https://github.com/ClickHouse/ClickHouse/pull/59808) ([Alexander Nikolaev](https://github.com/AlexNik)).
+
+#### Bug Fix (user-visible misbehavior in an official stable release)
+
+* Fix working with read buffers in StreamingFormatExecutor [#57438](https://github.com/ClickHouse/ClickHouse/pull/57438) ([Kruglov Pavel](https://github.com/Avogar)).
+* Fix double destroy call on exception throw in addBatchLookupTable8 [#58745](https://github.com/ClickHouse/ClickHouse/pull/58745) ([Raúl Marín](https://github.com/Algunenano)).
+* Fix: LIMIT BY and LIMIT in distributed query [#59153](https://github.com/ClickHouse/ClickHouse/pull/59153) ([Igor Nikonov](https://github.com/devcrafter)). +* Fix translate() with FixedString input [#59356](https://github.com/ClickHouse/ClickHouse/pull/59356) ([Raúl Marín](https://github.com/Algunenano)). +* Fix leftPad / rightPad function with FixedString input [#59739](https://github.com/ClickHouse/ClickHouse/pull/59739) ([Raúl Marín](https://github.com/Algunenano)). +* Fix cosineDistance crash with Nullable [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Fix possible race in ManyAggregatedData dtor. [#58624](https://github.com/ClickHouse/ClickHouse/pull/58624) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Make ZooKeeper actually sequentialy consistent [#59735](https://github.com/ClickHouse/ClickHouse/pull/59735) ([Alexander Tokmakov](https://github.com/tavplubix)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 53bf705637d..e372e407ce1 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -55,6 +55,7 @@ v23.4.4.16-stable 2023-06-17 v23.4.3.48-stable 2023-06-12 v23.4.2.11-stable 2023-05-02 v23.4.1.1943-stable 2023-04-27 +v23.3.20.27-lts 2024-03-05 v23.3.19.32-lts 2024-01-05 v23.3.18.15-lts 2023-11-25 v23.3.17.13-lts 2023-11-17
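The fuzzer-report fix in PATCH 65 above is easy to misread: in the removed line, `sed` scanned the non-existent `s.log`, and `awk` was handed `server.log` as an explicit file operand, so the `sed` output piped to its stdin was ignored entirely. A rough Python re-implementation of the intended filter — for illustration only, following the `<Fatal>` marker and paragraph handling of the shell pipeline, with blank separator lines dropped for brevity:

```python
import re


def extract_fatal_paragraphs(server_log: str) -> str:
    """Approximate `sed -n '/<Fatal>/,/^$/p' server.log | awk '/<Fatal>/ || !/^20/'`:
    take each paragraph starting at a line containing <Fatal> and running up to the
    next blank line, then keep only the lines that contain <Fatal> or that do not
    look like new timestamped (20xx-...) log entries."""
    kept = []
    in_paragraph = False
    for line in server_log.splitlines():
        if "<Fatal>" in line:
            in_paragraph = True
        if not in_paragraph:
            continue
        if line == "":
            in_paragraph = False  # sed's /^$/ range end; separator line dropped here
            continue
        if "<Fatal>" in line or not re.match(r"^20", line):
            kept.append(line)
    return "\n".join(kept)
```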