From fb17d0d16939434be0990b18cfdd1fecc6227bb1 Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Tue, 23 Jan 2024 11:31:12 +0000
Subject: [PATCH 001/192] Introduce EmbeddedRocksDBBulkSink

No mem-table: an SST file is built from each chunk, then imported into RocksDB.
Also add RocksDBSettings as table-level settings for StorageEmbeddedRocksDB.

Signed-off-by: Duc Canh Le
---
 src/Interpreters/IKeyValueEntity.h            |   1 +
 .../RocksDB/EmbeddedRocksDBBulkSink.cpp       | 149 ++++++++++++++++++
 .../RocksDB/EmbeddedRocksDBBulkSink.h         |  56 +++++++
 src/Storages/RocksDB/RocksDBSettings.cpp      |  41 +++++
 src/Storages/RocksDB/RocksDBSettings.h        |  39 +++++
 .../RocksDB/StorageEmbeddedRocksDB.cpp        |  42 +++--
 src/Storages/RocksDB/StorageEmbeddedRocksDB.h |  11 +-
 .../02956_rocksdb_bulk_sink.reference         |   1 +
 .../0_stateless/02956_rocksdb_bulk_sink.sql   |   3 +
 9 files changed, 326 insertions(+), 17 deletions(-)
 create mode 100644 src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
 create mode 100644 src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h
 create mode 100644 src/Storages/RocksDB/RocksDBSettings.cpp
 create mode 100644 src/Storages/RocksDB/RocksDBSettings.h
 create mode 100644 tests/queries/0_stateless/02956_rocksdb_bulk_sink.reference
 create mode 100644 tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql

diff --git a/src/Interpreters/IKeyValueEntity.h b/src/Interpreters/IKeyValueEntity.h
index d1ceda57f0e..76f652ac2d0 100644
--- a/src/Interpreters/IKeyValueEntity.h
+++ b/src/Interpreters/IKeyValueEntity.h
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include
 
 namespace DB
 {
diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
new file mode 100644
index 00000000000..1c5d48fe62d
--- /dev/null
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
@@ -0,0 +1,149 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ROCKSDB_ERROR;
+}
+
+static const IColumn::Permutation & getAscendingPermutation(const IColumn & column, IColumn::Permutation & perm)
+{
+    column.getPermutation(IColumn::PermutationSortDirection::Ascending, IColumn::PermutationSortStability::Stable, 0, 1, perm);
+    return perm;
+}
+
+static rocksdb::Status buildSSTFile(const String & path, const ColumnString & keys, const ColumnString & values, const std::optional & perm_ = {})
+{
+    IColumn::Permutation calculated_perm;
+    const IColumn::Permutation & perm = perm_ ? *perm_ : getAscendingPermutation(keys, calculated_perm);
+
+    rocksdb::SstFileWriter sst_file_writer(rocksdb::EnvOptions{}, rocksdb::Options{});
+    auto status = sst_file_writer.Open(path);
+    if (!status.ok())
+        return status;
+
+    auto rows = perm.size();
+    WriteBufferFromOwnString wb_value;
+    for (size_t i = 0; i < rows; ++i)
+    {
+        auto row = perm[i];
+
+        status = sst_file_writer.Put(keys.getDataAt(row).toView(), values.getDataAt(row).toView());
+
+        /// There could be duplicated keys in the chunk, thus Put may give IsInvalidArgument. This is ok, as we're certain that
+        /// keys are sorted in ascending order.
+        if (!status.ok() && !status.IsInvalidArgument())
+            return status;
+    }
+    return sst_file_writer.Finish();
+}
+
+EmbeddedRocksDBBulkSink::EmbeddedRocksDBBulkSink(
+    ContextPtr context_, StorageEmbeddedRocksDB & storage_, const StorageMetadataPtr & metadata_snapshot_)
+    : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_), storage(storage_), metadata_snapshot(metadata_snapshot_)
+{
+    for (const auto & elem : getHeader())
+    {
+        if (elem.name == storage.primary_key)
+            break;
+        ++primary_key_pos;
+    }
+    serializations = getHeader().getSerializations();
+    /// If max_insert_threads > 1 we may have multiple EmbeddedRocksDBBulkSink and getContext()->getCurrentQueryId() is not guaranteed to
+    /// have a distinct path
+    insert_directory_queue = fs::path(storage.getDataPaths()[0]) / (getContext()->getCurrentQueryId() + "_" + getRandomASCIIString(8));
+    fs::create_directory(insert_directory_queue);
+
+    // serialized_key_column = ColumnString::create();
+    // serialized_value_column = ColumnString::create();
+    // writer_key = std::make_unique>(serialized_key_column->getChars());
+    // writer_value = std::make_unique>(serialized_value_column->getChars());
+}
+
+EmbeddedRocksDBBulkSink::~EmbeddedRocksDBBulkSink()
+{
+    if (fs::exists(insert_directory_queue))
+        fs::remove_all(insert_directory_queue);
+}
+
+void EmbeddedRocksDBBulkSink::consume(Chunk chunk)
+{
+    auto rows = chunk.getNumRows();
+    const auto columns = chunk.detachColumns();
+
+    auto serialized_key_column = ColumnString::create();
+    auto serialized_value_column = ColumnString::create();
+    {
+        auto & serialized_key_data = serialized_key_column->getChars();
+        auto & serialized_key_offsets = serialized_key_column->getOffsets();
+        auto & serialized_value_data = serialized_value_column->getChars();
+        auto & serialized_value_offsets = serialized_value_column->getOffsets();
+        serialized_key_offsets.reserve(rows);
+        serialized_value_offsets.reserve(rows);
+        // serialized_key_offsets.clear();
+        // serialized_value_offsets.clear();
+        // serialized_key_data.clear();
+        // serialized_value_data.clear();
+        WriteBufferFromVector writer_key(serialized_key_data);
+        WriteBufferFromVector writer_value(serialized_value_data);
+        for (size_t i = 0; i < rows; ++i)
+        {
+            for (size_t idx = 0; idx < columns.size(); ++idx)
+                serializations[idx]->serializeBinary(*columns[idx], i, idx == primary_key_pos ? writer_key : writer_value, {});
+            writeChar('\0', writer_key);
+            writeChar('\0', writer_value);
+            serialized_key_offsets.emplace_back(writer_key.count());
+            serialized_value_offsets.emplace_back(writer_value.count());
+        }
+        writer_key.finalize();
+        writer_value.finalize();
+    }
+
+    auto path = getTemporarySSTFilePath();
+    if (auto status = buildSSTFile(path, *serialized_key_column, *serialized_value_column); !status.ok())
+        throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());
+
+    rocksdb::IngestExternalFileOptions ingest_options;
+    ingest_options.move_files = true; /// The temporary file is on the same disk, so moving (or hardlinking) the file is faster than copying
+    if (auto status = storage.rocksdb_ptr->IngestExternalFile({path}, rocksdb::IngestExternalFileOptions()); !status.ok())
+        throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());
+
+    if (fs::exists(path))
+        fs::remove(path);
+}
+
+String EmbeddedRocksDBBulkSink::getTemporarySSTFilePath()
+{
+    return fs::path(insert_directory_queue) / (toString(file_counter++) + ".sst");
+}
+
+}
diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h
new file mode 100644
index 00000000000..312ad4bd93c
--- /dev/null
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+class StorageEmbeddedRocksDB;
+class EmbeddedRocksDBBulkSink;
+struct StorageInMemoryMetadata;
+using StorageMetadataPtr = std::shared_ptr;
+
+/// Optimized for bulk importing into StorageEmbeddedRocksDB:
+/// 1. No mem-table: an SST file is built from each chunk, then imported into RocksDB
+/// 2. Overlap compute and IO: one thread prepares RocksDB data from a chunk while another thread writes the data to an SST file
+class EmbeddedRocksDBBulkSink : public SinkToStorage, public WithContext
+{
+public:
+    EmbeddedRocksDBBulkSink(
+        ContextPtr context_,
+        StorageEmbeddedRocksDB & storage_,
+        const StorageMetadataPtr & metadata_snapshot_);
+
+    ~EmbeddedRocksDBBulkSink() override;
+
+    void consume(Chunk chunk) override;
+    String getName() const override { return "EmbeddedRocksDBBulkSink"; }
+
+private:
+
+    String getTemporarySSTFilePath();
+
+    std::atomic_size_t file_counter = 0;
+    StorageEmbeddedRocksDB & storage;
+    StorageMetadataPtr metadata_snapshot;
+    size_t primary_key_pos = 0;
+    Serializations serializations;
+    String insert_directory_queue;
+
+    // /// Columns to hold key-value pairs, reused for all `consume` calls
+    // /// to reduce memory re-allocations
+    // ColumnString::MutablePtr serialized_key_column;
+    // ColumnString::MutablePtr serialized_value_column;
+};
+
+}
diff --git a/src/Storages/RocksDB/RocksDBSettings.cpp b/src/Storages/RocksDB/RocksDBSettings.cpp
new file mode 100644
index 00000000000..7de2077eb47
--- /dev/null
+++ b/src/Storages/RocksDB/RocksDBSettings.cpp
@@ -0,0 +1,41 @@
+#include "RocksDBSettings.h"
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_SETTING;
+}
+
+IMPLEMENT_SETTINGS_TRAITS(RockDBSettingsTraits, LIST_OF_ROCKSDB_SETTINGS)
+
+
+void RocksDBSettings::loadFromQuery(ASTStorage & storage_def, ContextPtr /*context*/)
+{
+    if (storage_def.settings)
+    {
+        try
+        {
+            auto changes = storage_def.settings->changes;
+            applyChanges(changes);
+        }
+        catch (Exception & e)
+        {
+            if (e.code() == ErrorCodes::UNKNOWN_SETTING)
+                e.addMessage("for storage " + storage_def.engine->name);
+            throw;
+        }
+    }
+}
+
+std::vector RocksDBSettings::getAllRegisteredNames() const
+{
+    std::vector all_settings;
+    for (const auto & setting_field : all())
+        all_settings.push_back(setting_field.getName());
+    return all_settings;
+}
+}
diff --git a/src/Storages/RocksDB/RocksDBSettings.h b/src/Storages/RocksDB/RocksDBSettings.h
new file mode 100644
index 00000000000..f3d16bc4b3e
--- /dev/null
+++ b/src/Storages/RocksDB/RocksDBSettings.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+
+namespace Poco::Util
+{
+class AbstractConfiguration;
+}
+
+
+namespace DB
+{
+class ASTStorage;
+struct Settings;
+
+
+/** StorageEmbeddedRocksDB table settings
+ */
+
+#define ROCKSDB_SETTINGS(M, ALIAS) \
+    M(Bool, optimize_for_bulk_insert, true, "Table is optimized for bulk insertions (insert pipeline will create SST files and import to rocksdb database instead of writing " \
+        "to memtables)", 0)
+
+#define LIST_OF_ROCKSDB_SETTINGS(M, ALIAS) ROCKSDB_SETTINGS(M, ALIAS)
+
+DECLARE_SETTINGS_TRAITS(RockDBSettingsTraits, LIST_OF_ROCKSDB_SETTINGS)
+
+struct RocksDBSettings : public BaseSettings, public IHints<2>
+{
+    void loadFromQuery(ASTStorage & storage_def, ContextPtr context);
+    std::vector getAllRegisteredNames() const override;
+};
+
+}
diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp
index 4ead714c740..3bfeb561408 100644
--- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp
+++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp
@@ -1,6 +1,5 @@
 #include
 #include
-#include
 
 #include
 #include
@@ -28,8 +27,14 @@
 #include
 #include
 #include
+#include
+#include
+#include
 
 #include
+#include
+#include
+#include
 #include
 #include
 #include
@@ -174,6 +179,7 @@ 
StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, const StorageInMemoryMetadata & metadata_, bool attach, ContextPtr context_, + RocksDBSettings settings_, const String & primary_key_, Int32 ttl_, String rocksdb_dir_, @@ -186,6 +192,7 @@ StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, , read_only(read_only_) { setInMemoryMetadata(metadata_); + setSettings(std::move(settings_)); if (rocksdb_dir.empty()) { rocksdb_dir = context_->getPath() + relative_data_path_; @@ -234,22 +241,20 @@ void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPt if (commands.front().type == MutationCommand::Type::DELETE) { - MutationsInterpreter::Settings settings(true); - settings.return_all_columns = true; - settings.return_mutated_rows = true; + MutationsInterpreter::Settings mutation_settings(true); + mutation_settings.return_all_columns = true; + mutation_settings.return_mutated_rows = true; auto interpreter = std::make_unique( storage_ptr, metadata_snapshot, commands, context_, - settings); + mutation_settings); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); - auto sink = std::make_shared(*this, metadata_snapshot); - auto header = interpreter->getUpdatedHeader(); auto primary_key_pos = header.getPositionByName(primary_key); @@ -285,16 +290,16 @@ void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPt if (commands.front().column_to_update_expression.contains(primary_key)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated (cannot update column {})", primary_key); - MutationsInterpreter::Settings settings(true); - settings.return_all_columns = true; - settings.return_mutated_rows = true; + MutationsInterpreter::Settings mutation_settings(true); + mutation_settings.return_all_columns = true; + mutation_settings.return_mutated_rows = true; auto interpreter = std::make_unique( storage_ptr, metadata_snapshot, commands, context_, - settings); + mutation_settings); auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); PullingPipelineExecutor executor(pipeline); @@ -350,7 +355,6 @@ void StorageEmbeddedRocksDB::initDB() rocksdb::Options base; base.create_if_missing = true; - base.compression = rocksdb::CompressionType::kZSTD; base.statistics = rocksdb::CreateDBStatistics(); /// It is too verbose by default, and in fact we don't care about rocksdb logs at all. 
base.info_log_level = rocksdb::ERROR_LEVEL; @@ -590,8 +594,11 @@ void ReadFromEmbeddedRocksDB::applyFilters() } SinkToStoragePtr StorageEmbeddedRocksDB::write( - const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/, bool /*async_insert*/) + const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context, bool /*async_insert*/) { + if (getSettings().optimize_for_bulk_insert) + return std::make_shared(query_context, *this, metadata_snapshot); + return std::make_shared(*this, metadata_snapshot); } @@ -630,7 +637,9 @@ static StoragePtr create(const StorageFactory::Arguments & args) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "StorageEmbeddedRocksDB must require one column in primary key"); } - return std::make_shared(args.table_id, args.relative_data_path, metadata, args.attach, args.getContext(), primary_key_names[0], ttl, std::move(rocksdb_dir), read_only); + RocksDBSettings settings; + settings.loadFromQuery(*args.storage_def, args.getContext()); + return std::make_shared(args.table_id, args.relative_data_path, metadata, args.attach, args.getContext(), std::move(settings), primary_key_names[0], ttl, std::move(rocksdb_dir), read_only); } std::shared_ptr StorageEmbeddedRocksDB::getRocksDBStatistics() const @@ -721,9 +730,9 @@ Chunk StorageEmbeddedRocksDB::getBySerializedKeys( return Chunk(std::move(columns), num_rows); } -std::optional StorageEmbeddedRocksDB::totalRows(const Settings & settings) const +std::optional StorageEmbeddedRocksDB::totalRows(const Settings & query_settings) const { - if (!settings.optimize_trivial_approximate_count_query) + if (!query_settings.optimize_trivial_approximate_count_query) return {}; std::shared_lock lock(rocksdb_ptr_mx); if (!rocksdb_ptr) @@ -748,6 +757,7 @@ std::optional StorageEmbeddedRocksDB::totalBytes(const Settings & /*sett void registerStorageEmbeddedRocksDB(StorageFactory & factory) { StorageFactory::StorageFeatures features{ + .supports_settings = true, .supports_sort_order = true, .supports_ttl = true, .supports_parallel_insert = true, diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index b09dfca7338..fff7ed3644f 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -2,10 +2,12 @@ #include #include -#include #include #include +#include #include +#include +#include namespace rocksdb @@ -27,6 +29,7 @@ class Context; class StorageEmbeddedRocksDB final : public IStorage, public IKeyValueEntity, WithContext { friend class EmbeddedRocksDBSink; + friend class EmbeddedRocksDBBulkSink; friend class ReadFromEmbeddedRocksDB; public: StorageEmbeddedRocksDB(const StorageID & table_id_, @@ -34,6 +37,7 @@ public: const StorageInMemoryMetadata & metadata, bool attach, ContextPtr context_, + RocksDBSettings settings_, const String & primary_key_, Int32 ttl_ = 0, String rocksdb_dir_ = "", @@ -97,7 +101,12 @@ public: std::optional totalBytes(const Settings & settings) const override; + const RocksDBSettings & getSettings() const { return settings; } + + void setSettings(RocksDBSettings settings_) { settings = std::move(settings_); } + private: + RocksDBSettings settings; const String primary_key; using RocksDBPtr = std::unique_ptr; RocksDBPtr rocksdb_ptr; diff --git a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.reference b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.reference new file mode 100644 index 00000000000..83b33d238da --- /dev/null +++ 
b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.reference
@@ -0,0 +1 @@
+1000
diff --git a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql
new file mode 100644
index 00000000000..cfb97b049bf
--- /dev/null
+++ b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql
@@ -0,0 +1,3 @@
+CREATE TABLE rocksdb_worm (key UInt64, value UInt64) ENGINE = EmbeddedRocksDB() PRIMARY KEY key SETTINGS optimize_for_bulk_insert = 1;
+INSERT INTO rocksdb_worm SELECT number, number+1 FROM numbers_mt(1000) SETTINGS max_insert_threads = 2;
+SELECT count() FROM rocksdb_worm;

From 8218657cbc5bc997c391fd42de2d9fb7cc66fec7 Mon Sep 17 00:00:00 2001
From: vdimir
Date: Thu, 25 Jan 2024 10:14:43 +0100
Subject: [PATCH 002/192] Add optimize_for_bulk_insert to embedded-rocksdb.md

---
 .../engines/table-engines/integrations/embedded-rocksdb.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
index 44febe78c77..c880ad7253c 100644
--- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
+++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
@@ -17,6 +17,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
     name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
     ...
 ) ENGINE = EmbeddedRocksDB([ttl, rocksdb_dir, read_only]) PRIMARY KEY(primary_key_name)
+[ SETTINGS optimize_for_bulk_insert = (0|1)]
 ```
 
 Engine parameters:
@@ -29,6 +30,10 @@ Engine parameters:
 - columns other than the primary key will be serialized in binary as `rocksdb` value in corresponding order.
 - queries with key `equals` or `in` filtering will be optimized to multi keys lookup from `rocksdb`.
 
+Engine settings:
+
+- `optimize_for_bulk_insert` – The table is optimized for bulk insertions (the insert pipeline will create SST files and import them into the RocksDB database instead of writing to memtables).
+
 Example:
 
 ``` sql

From cce998e875fa7097a8d49d1193b684d52fc9a00d Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Thu, 25 Jan 2024 14:15:01 +0000
Subject: [PATCH 003/192] clean up and add some comments

Signed-off-by: Duc Canh Le
---
 .../RocksDB/EmbeddedRocksDBBulkSink.cpp       | 20 +++++++++----------
 .../RocksDB/EmbeddedRocksDBBulkSink.h         |  8 ++------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
index 1c5d48fe62d..d25bfc50b22 100644
--- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
@@ -42,6 +42,7 @@ static const IColumn::Permutation & getAscendingPermutation(const IColumn & colu
 
 static rocksdb::Status buildSSTFile(const String & path, const ColumnString & keys, const ColumnString & values, const std::optional & perm_ = {})
 {
+    /// rocksdb::SstFileWriter requires keys to be sorted in ascending order
     IColumn::Permutation calculated_perm;
     const IColumn::Permutation & perm = perm_ ? *perm_ : getAscendingPermutation(keys, calculated_perm);
@@ -63,6 +64,7 @@ static rocksdb::Status buildSSTFile(const String & path, const ColumnString & ke
         if (!status.ok() && !status.IsInvalidArgument())
             return status;
     }
+
     return sst_file_writer.Finish();
 }
 
@@ -80,13 +82,8 @@ EmbeddedRocksDBBulkSink::EmbeddedRocksDBBulkSink(
     }
     serializations = getHeader().getSerializations();
     /// If max_insert_threads > 1 we may have multiple EmbeddedRocksDBBulkSink and getContext()->getCurrentQueryId() is not guaranteed to
     /// have a distinct path
-    insert_directory_queue = fs::path(storage.getDataPaths()[0]) / (getContext()->getCurrentQueryId() + "_" + getRandomASCIIString(8));
+    insert_directory_queue = fs::path(storage.getDataPaths()[0]) / (getContext()->getCurrentQueryId() + "-" + getRandomASCIIString(8));
     fs::create_directory(insert_directory_queue);
-
-    // serialized_key_column = ColumnString::create();
-    // serialized_value_column = ColumnString::create();
-    // writer_key = std::make_unique>(serialized_key_column->getChars());
-    // writer_value = std::make_unique>(serialized_value_column->getChars());
 }
 
 EmbeddedRocksDBBulkSink::~EmbeddedRocksDBBulkSink()
@@ -100,21 +97,21 @@ void EmbeddedRocksDBBulkSink::consume(Chunk chunk)
     auto rows = chunk.getNumRows();
     const auto columns = chunk.detachColumns();
 
+    /// Convert chunk to rocksdb key-value pairs
     auto serialized_key_column = ColumnString::create();
     auto serialized_value_column = ColumnString::create();
+
     {
         auto & serialized_key_data = serialized_key_column->getChars();
         auto & serialized_key_offsets = serialized_key_column->getOffsets();
         auto & serialized_value_data = serialized_value_column->getChars();
         auto & serialized_value_offsets = serialized_value_column->getOffsets();
+
         serialized_key_offsets.reserve(rows);
         serialized_value_offsets.reserve(rows);
-        // serialized_key_offsets.clear();
-        // serialized_value_offsets.clear();
-        // serialized_key_data.clear();
-        // serialized_value_data.clear();
         WriteBufferFromVector writer_key(serialized_key_data);
         WriteBufferFromVector writer_value(serialized_value_data);
+
         for (size_t i = 0; i < rows; ++i)
         {
             for (size_t idx = 0; idx < columns.size(); ++idx)
@@ -124,14 +121,17 @@ void EmbeddedRocksDBBulkSink::consume(Chunk chunk)
             serialized_key_offsets.emplace_back(writer_key.count());
             serialized_value_offsets.emplace_back(writer_value.count());
         }
+
         writer_key.finalize();
         writer_value.finalize();
     }
 
+    /// Build SST file from key-value pairs
     auto path = getTemporarySSTFilePath();
     if (auto status = buildSSTFile(path, *serialized_key_column, *serialized_value_column); !status.ok())
         throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());
 
+    /// Ingest the SST file
     rocksdb::IngestExternalFileOptions ingest_options;
     ingest_options.move_files = true; /// The temporary file is on the same disk, so moving (or hardlinking) the file is faster than copying
     if (auto status = storage.rocksdb_ptr->IngestExternalFile({path}, rocksdb::IngestExternalFileOptions()); !status.ok())
         throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());
diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h
index 312ad4bd93c..ed6a8068683 100644
--- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h
@@ -34,10 +34,11 @@ public:
     ~EmbeddedRocksDBBulkSink() override;
 
     void consume(Chunk chunk) override;
+
     String getName() const override { return "EmbeddedRocksDBBulkSink"; }
 
 private:
-
+    /// Get a unique path to write temporary SST file
     String 
getTemporarySSTFilePath(); std::atomic_size_t file_counter = 0; @@ -46,11 +47,6 @@ private: size_t primary_key_pos = 0; Serializations serializations; String insert_directory_queue; - - // /// Columns to hold key-value pairs, reused for all `consume` calls - // /// to reduce memory re-allocations - // ColumnString::MutablePtr serialized_key_column; - // ColumnString::MutablePtr serialized_value_column; }; } From 84efec443be2d05ba16d136533e6a3fd9ce598be Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Fri, 26 Jan 2024 01:28:26 +0000 Subject: [PATCH 004/192] fix document spell check Signed-off-by: Duc Canh Le --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 73b7a081797..49acc6b8bfb 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1858,6 +1858,8 @@ mdadm meanZTest meanztest mebibytes +memtable +memtables mergeable mergetree messageID From b99fd655323de3da1ad5f96247261f8db17b6ff4 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Fri, 26 Jan 2024 04:58:56 +0000 Subject: [PATCH 005/192] no rocksdb in fast test Signed-off-by: Duc Canh Le --- tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql index cfb97b049bf..69879a5e6b1 100644 --- a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql +++ b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql @@ -1,3 +1,4 @@ +-- Tags: no-fasttest CREATE TABLE rocksdb_worm (key UInt64, value UInt64) ENGINE = EmbeddedRocksDB() PRIMARY KEY key SETTINGS optimize_for_bulk_insert = 1; INSERT INTO rocksdb_worm SELECT number, number+1 FROM numbers_mt(1000) SETTINGS max_insert_threads = 2; SELECT count() FROM rocksdb_worm; From ff9a9e51564b6272b78d76d2ca87e51502fa4ea1 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Fri, 26 Jan 2024 13:25:56 +0000 Subject: [PATCH 006/192] fix tests Signed-off-by: Duc Canh Le --- tests/queries/0_stateless/01686_rocksdb.sql | 2 +- tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01686_rocksdb.sql b/tests/queries/0_stateless/01686_rocksdb.sql index f3177ce140e..3ff218bf398 100644 --- a/tests/queries/0_stateless/01686_rocksdb.sql +++ b/tests/queries/0_stateless/01686_rocksdb.sql @@ -4,7 +4,7 @@ DROP TABLE IF EXISTS 01686_test; -CREATE TABLE 01686_test (key UInt64, value String) Engine=EmbeddedRocksDB PRIMARY KEY(key); +CREATE TABLE 01686_test (key UInt64, value String) Engine=EmbeddedRocksDB PRIMARY KEY(key) SETTINGS optimize_for_bulk_insert = 0; SELECT value FROM system.rocksdb WHERE database = currentDatabase() and table = '01686_test' and name = 'number.keys.written'; INSERT INTO 01686_test SELECT number, format('Hello, world ({})', toString(number)) FROM numbers(10000); diff --git a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql index 69879a5e6b1..01a014ddfe1 100644 --- a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql +++ b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql @@ -1,4 +1,6 @@ --- Tags: no-fasttest +-- Tags: no-ordinary-database, no-fasttest +-- Tag no-ordinary-database: Sometimes cannot lock file most likely due to concurrent or adjacent tests, but we 
don't care how it works in Ordinary database
+-- Tag no-fasttest: In fasttest, ENABLE_LIBRARIES=0, so rocksdb engine is not enabled by default
 CREATE TABLE rocksdb_worm (key UInt64, value UInt64) ENGINE = EmbeddedRocksDB() PRIMARY KEY key SETTINGS optimize_for_bulk_insert = 1;
 INSERT INTO rocksdb_worm SELECT number, number+1 FROM numbers_mt(1000) SETTINGS max_insert_threads = 2;
 SELECT count() FROM rocksdb_worm;

From 2cd0d51b6947f0b7c4f62a8626ed164417cfe8a3 Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Tue, 30 Jan 2024 02:22:07 +0000
Subject: [PATCH 007/192] hotfix: rocksdb bulk sink fails with user-defined
 query id

Signed-off-by: Duc Canh Le
---
 src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
index d25bfc50b22..ad54970df5f 100644
--- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -81,8 +82,10 @@ EmbeddedRocksDBBulkSink::EmbeddedRocksDBBulkSink(
     }
     serializations = getHeader().getSerializations();
     /// If max_insert_threads > 1 we may have multiple EmbeddedRocksDBBulkSink and getContext()->getCurrentQueryId() is not guaranteed to
-    /// have a distinct path
-    insert_directory_queue = fs::path(storage.getDataPaths()[0]) / (getContext()->getCurrentQueryId() + "-" + getRandomASCIIString(8));
+    /// have a distinct path. Also we cannot use the query id as the directory name here, because it could be defined by the user and not suitable
+    /// for a directory name
+    auto base_directory_name = sipHash128String(getContext()->getCurrentQueryId());
+    insert_directory_queue = fs::path(storage.getDataPaths()[0]) / (base_directory_name + "-" + getRandomASCIIString(8));
     fs::create_directory(insert_directory_queue);
 }

From fce0cca924ded39196d338bd57416340ac75aa84 Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Tue, 30 Jan 2024 08:37:41 +0000
Subject: [PATCH 008/192] hotfix: handling empty chunk

Signed-off-by: Duc Canh Le
---
 src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
index ad54970df5f..309229f2931 100644
--- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
@@ -98,6 +98,10 @@ EmbeddedRocksDBBulkSink::~EmbeddedRocksDBBulkSink()
 void EmbeddedRocksDBBulkSink::consume(Chunk chunk)
 {
     auto rows = chunk.getNumRows();
+
+    if (rows == 0) /// TODO: squashing if rows are too small
+        return;
+
     const auto columns = chunk.detachColumns();

From ffa7784a3cc89f0fc9d55a9f67dd1a5105ce430d Mon Sep 17 00:00:00 2001
From: Duc Canh Le
Date: Wed, 31 Jan 2024 02:11:17 +0000
Subject: [PATCH 009/192] rocksdb: enforce batch size for bulk insert

Signed-off-by: Duc Canh Le
---
 .../RocksDB/EmbeddedRocksDBBulkSink.cpp       | 113 +++++++++++++++---
 .../RocksDB/EmbeddedRocksDBBulkSink.h         |  19 ++-
 src/Storages/RocksDB/RocksDBSettings.h        |   4 +-
 3 files changed, 115 insertions(+), 21 deletions(-)

diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
index 309229f2931..0654a1dd33d 100644
--- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
+++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp
@@ -41,6 +41,7 @@ static const IColumn::Permutation & getAscendingPermutation(const IColumn & colu
     return perm;
 }
 
+/// Build SST file from key-value pairs
 static rocksdb::Status buildSSTFile(const String & path, const ColumnString & keys, const ColumnString & values, const std::optional & perm_ = {})
 {
     /// rocksdb::SstFileWriter requires keys to be sorted in ascending order
@@ -80,7 +81,10 @@ EmbeddedRocksDBBulkSink::EmbeddedRocksDBBulkSink(
             break;
         ++primary_key_pos;
     }
+
     serializations = getHeader().getSerializations();
+    min_block_size_rows = std::max(storage.getSettings().bulk_insert_block_size, getContext()->getSettingsRef().min_insert_block_size_rows);
+
     /// If max_insert_threads > 1 we may have multiple EmbeddedRocksDBBulkSink and getContext()->getCurrentQueryId() is not guaranteed to
     /// have a distinct path. Also we cannot use the query id as the directory name here, because it could be defined by the user and not suitable
     /// for a directory name
 
-void EmbeddedRocksDBBulkSink::consume(Chunk chunk)
+std::vector EmbeddedRocksDBBulkSink::squash(Chunk chunk)
 {
+    /// End of input stream
+    if (chunk.getNumRows() == 0)
+    {
+        if (chunks.empty())
+            return {};
+        std::vector to_return;
+        std::swap(to_return, chunks);
+        return to_return;
+    }
+
+    /// The block just read is already big enough.
+    if (isEnoughSize(chunk))
+    {
+        /// If there is no accumulated data, return the block just read.
+        if (chunks.empty())
+        {
+            chunks.emplace_back(std::move(chunk));
+            return {};
+        }
+
+        /// Return the accumulated data (it may still be small) and place the new block into the accumulated data.
+        std::vector to_return;
+        std::swap(to_return, chunks);
+        chunks.emplace_back(std::move(chunk));
+        return to_return;
+    }
+
+    /// The accumulated data is already big enough.
+    if (isEnoughSize(chunks))
+    {
+        /// Return the accumulated data and place the new block into the accumulated data.
+        std::vector to_return;
+        std::swap(to_return, chunks);
+        chunks.emplace_back(std::move(chunk));
+        return to_return;
+    }
+
+    chunks.emplace_back(std::move(chunk));
+    if (isEnoughSize(chunks))
+    {
+        std::vector to_return;
+        std::swap(to_return, chunks);
+        return to_return;
+    }
+
+    /// The squashed block is not ready yet.
+    return {};
+}
+
+std::pair EmbeddedRocksDBBulkSink::serializeChunks(const std::vector & input_chunks) const
+{
     auto serialized_key_column = ColumnString::create();
     auto serialized_value_column = ColumnString::create();
 
     {
         auto & serialized_key_data = serialized_key_column->getChars();
         auto & serialized_key_offsets = serialized_key_column->getOffsets();
         auto & serialized_value_data = serialized_value_column->getChars();
         auto & serialized_value_offsets = serialized_value_column->getOffsets();
-
-        serialized_key_offsets.reserve(rows);
-        serialized_value_offsets.reserve(rows);
         WriteBufferFromVector writer_key(serialized_key_data);
         WriteBufferFromVector writer_value(serialized_value_data);
 
-        for (size_t i = 0; i < rows; ++i)
-        {
-            for (size_t idx = 0; idx < columns.size(); ++idx)
-                serializations[idx]->serializeBinary(*columns[idx], i, idx == primary_key_pos ? writer_key : writer_value, {});
-            writeChar('\0', writer_key);
-            writeChar('\0', writer_value);
-            serialized_key_offsets.emplace_back(writer_key.count());
-            serialized_value_offsets.emplace_back(writer_value.count());
-        }
+        for (const auto & chunk : input_chunks)
+        {
+            const auto & columns = chunk.getColumns();
+            auto rows = chunk.getNumRows();
+            for (size_t i = 0; i < rows; ++i)
+            {
+                for (size_t idx = 0; idx < columns.size(); ++idx)
+                    serializations[idx]->serializeBinary(*columns[idx], i, idx == primary_key_pos ? 
writer_key : writer_value, {}); - writeChar('\0', writer_key); - writeChar('\0', writer_value); - serialized_key_offsets.emplace_back(writer_key.count()); - serialized_value_offsets.emplace_back(writer_value.count()); + const auto & columns = chunk.getColumns(); + auto rows = chunk.getNumRows(); + for (size_t i = 0; i < rows; ++i) + { + for (size_t idx = 0; idx < columns.size(); ++idx) + serializations[idx]->serializeBinary(*columns[idx], i, idx == primary_key_pos ? writer_key : writer_value, {}); + writeChar('\0', writer_key); + writeChar('\0', writer_value); + serialized_key_offsets.emplace_back(writer_key.count()); + serialized_value_offsets.emplace_back(writer_value.count()); + } } writer_key.finalize(); writer_value.finalize(); } - /// Build SST file from key-value pairs + return {std::move(serialized_key_column), std::move(serialized_value_column)}; +} + +void EmbeddedRocksDBBulkSink::consume(Chunk chunk_) +{ + std::vector to_written = squash(std::move(chunk_)); + + if (to_written.empty()) + return; + + auto [serialized_key_column, serialized_value_column] = serializeChunks(to_written); auto path = getTemporarySSTFilePath(); if (auto status = buildSSTFile(path, *serialized_key_column, *serialized_value_column); !status.ok()) throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString()); @@ -148,9 +207,29 @@ void EmbeddedRocksDBBulkSink::consume(Chunk chunk) fs::remove(path); } +void EmbeddedRocksDBBulkSink::onFinish() +{ + /// If there is any data left, write it. + if (!chunks.empty()) + consume({}); +} + + String EmbeddedRocksDBBulkSink::getTemporarySSTFilePath() { return fs::path(insert_directory_queue) / (toString(file_counter++) + ".sst"); } +bool EmbeddedRocksDBBulkSink::isEnoughSize(const std::vector & input_chunks) const +{ + size_t total_rows = 0; + for (const auto & chunk : input_chunks) + total_rows += chunk.getNumRows(); + return total_rows >= min_block_size_rows; +} + +bool EmbeddedRocksDBBulkSink::isEnoughSize(const Chunk & chunk) const +{ + return chunk.getNumRows() >= min_block_size_rows; +} } diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h index ed6a8068683..fe28576a4a3 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include namespace DB @@ -35,17 +35,32 @@ public: void consume(Chunk chunk) override; + void onFinish() override; + String getName() const override { return "EmbeddedRocksDBBulkSink"; } private: /// Get a unique path to write temporary SST file String getTemporarySSTFilePath(); - std::atomic_size_t file_counter = 0; + /// Squash chunks to a minimum size + std::vector squash(Chunk chunk); + bool isEnoughSize(const std::vector & input_chunks) const; + bool isEnoughSize(const Chunk & chunk) const; + /// Serialize chunks to rocksdb key-value pairs + std::pair serializeChunks(const std::vector & input_chunks) const; + StorageEmbeddedRocksDB & storage; StorageMetadataPtr metadata_snapshot; size_t primary_key_pos = 0; Serializations serializations; + + /// For squashing chunks + std::vector chunks; + size_t min_block_size_rows = 0; + + /// For writing SST files + std::atomic_size_t file_counter = 0; String insert_directory_queue; }; diff --git a/src/Storages/RocksDB/RocksDBSettings.h b/src/Storages/RocksDB/RocksDBSettings.h index f3d16bc4b3e..ccfa8449187 100644 --- a/src/Storages/RocksDB/RocksDBSettings.h +++ b/src/Storages/RocksDB/RocksDBSettings.h @@ -23,8 
+23,8 @@ struct Settings; */ #define ROCKSDB_SETTINGS(M, ALIAS) \ - M(Bool, optimize_for_bulk_insert, true, "Table is optimized for bulk insertions (insert pipeline will create SST files and import to rocksdb database instead of writing " \ - "to memtables)", 0) + M(Bool, optimize_for_bulk_insert, true, "Table is optimized for bulk insertions (insert pipeline will create SST files and import to rocksdb database instead of writing to memtables)", 0) \ + M(UInt64, bulk_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "Size of block for bulk insert, if it's smaller than query setting min_insert_block_size_rows then it will be overriden by min_insert_block_size_rows", 0) \ #define LIST_OF_ROCKSDB_SETTINGS(M, ALIAS) ROCKSDB_SETTINGS(M, ALIAS) From 3b8b5f7403fba7b090c67838d9b34722160750ac Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 31 Jan 2024 15:17:15 +0000 Subject: [PATCH 010/192] fix typo Signed-off-by: Duc Canh Le --- src/Storages/RocksDB/RocksDBSettings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/RocksDB/RocksDBSettings.h b/src/Storages/RocksDB/RocksDBSettings.h index ccfa8449187..1b168c56d89 100644 --- a/src/Storages/RocksDB/RocksDBSettings.h +++ b/src/Storages/RocksDB/RocksDBSettings.h @@ -24,7 +24,7 @@ struct Settings; #define ROCKSDB_SETTINGS(M, ALIAS) \ M(Bool, optimize_for_bulk_insert, true, "Table is optimized for bulk insertions (insert pipeline will create SST files and import to rocksdb database instead of writing to memtables)", 0) \ - M(UInt64, bulk_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "Size of block for bulk insert, if it's smaller than query setting min_insert_block_size_rows then it will be overriden by min_insert_block_size_rows", 0) \ + M(UInt64, bulk_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "Size of block for bulk insert, if it's smaller than query setting min_insert_block_size_rows then it will be overridden by min_insert_block_size_rows", 0) \ #define LIST_OF_ROCKSDB_SETTINGS(M, ALIAS) ROCKSDB_SETTINGS(M, ALIAS) From d8175451db35db8a17a52e016c6b4c4496890eb9 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Fri, 2 Feb 2024 13:50:50 +0800 Subject: [PATCH 011/192] Apply suggestions from code review Co-authored-by: vdimir --- src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp | 16 +++++++++++----- .../0_stateless/02956_rocksdb_bulk_sink.sql | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp index 0654a1dd33d..6d9c320684d 100644 --- a/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp +++ b/src/Storages/RocksDB/EmbeddedRocksDBBulkSink.cpp @@ -54,7 +54,6 @@ static rocksdb::Status buildSSTFile(const String & path, const ColumnString & ke return status; auto rows = perm.size(); - WriteBufferFromOwnString wb_value; for (size_t i = 0; i < rows; ++i) { auto row = perm[i]; @@ -95,8 +94,15 @@ EmbeddedRocksDBBulkSink::EmbeddedRocksDBBulkSink( EmbeddedRocksDBBulkSink::~EmbeddedRocksDBBulkSink() { - if (fs::exists(insert_directory_queue)) - fs::remove_all(insert_directory_queue); + try + { + if (fs::exists(insert_directory_queue)) + fs::remove_all(insert_directory_queue); + } + catch (...) 
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+    }
 }
 
 std::vector EmbeddedRocksDBBulkSink::squash(Chunk chunk)
@@ -193,14 +199,14 @@ void EmbeddedRocksDBBulkSink::consume(Chunk chunk_)
         return;
 
     auto [serialized_key_column, serialized_value_column] = serializeChunks(to_written);
-    auto path = getTemporarySSTFilePath();
-    if (auto status = buildSSTFile(path, *serialized_key_column, *serialized_value_column); !status.ok())
+    auto sst_file_path = getTemporarySSTFilePath();
+    if (auto status = buildSSTFile(sst_file_path, *serialized_key_column, *serialized_value_column); !status.ok())
         throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());
 
     /// Ingest the SST file
     rocksdb::IngestExternalFileOptions ingest_options;
     ingest_options.move_files = true; /// The temporary file is on the same disk, so moving (or hardlinking) the file is faster than copying
-    if (auto status = storage.rocksdb_ptr->IngestExternalFile({path}, rocksdb::IngestExternalFileOptions()); !status.ok())
+    if (auto status = storage.rocksdb_ptr->IngestExternalFile({sst_file_path}, ingest_options); !status.ok())
         throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());
 
-    if (fs::exists(path))
-        fs::remove(path);
+    if (fs::exists(sst_file_path))
+        fs::remove(sst_file_path);
diff --git a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql
index 01a014ddfe1..d685afadf81 100644
--- a/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql
+++ b/tests/queries/0_stateless/02956_rocksdb_bulk_sink.sql
@@ -1,4 +1,4 @@
--- Tags: no-ordinary-database, no-fasttest
+-- Tags: no-ordinary-database, use-rocksdb
 -- Tag no-ordinary-database: Sometimes cannot lock file most likely due to concurrent or adjacent tests, but we don't care how it works in Ordinary database
 -- Tag no-fasttest: In fasttest, ENABLE_LIBRARIES=0, so rocksdb engine is not enabled by default
 CREATE TABLE rocksdb_worm (key UInt64, value UInt64) ENGINE = EmbeddedRocksDB() PRIMARY KEY key SETTINGS optimize_for_bulk_insert = 1;

From bb55a0ca50fe2a93829e249df1f5f342ed11b4d2 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Mon, 11 Mar 2024 19:18:15 +0100
Subject: [PATCH 012/192] Less contention in cache (Part 4)

---
 src/Interpreters/Cache/EvictionCandidates.h   | 10 +++
 src/Interpreters/Cache/FileCache.cpp          | 83 +++++++++++++++++
 src/Interpreters/Cache/FileCache.h            |  8 ++
 src/Interpreters/Cache/FileCacheSettings.cpp  |  9 ++
 src/Interpreters/Cache/FileCacheSettings.h    |  4 +
 src/Interpreters/Cache/FileCache_fwd.h        |  3 +
 src/Interpreters/Cache/IFileCachePriority.h   |  9 ++
 .../Cache/LRUFileCachePriority.cpp            | 51 +++++++---
 src/Interpreters/Cache/LRUFileCachePriority.h | 14 ++++
 .../Cache/SLRUFileCachePriority.cpp           | 31 +++++++
 .../Cache/SLRUFileCachePriority.h             |  7 ++
 tests/config/config.d/storage_conf.xml        |  2 +
 12 files changed, 221 insertions(+), 10 deletions(-)

diff --git a/src/Interpreters/Cache/EvictionCandidates.h b/src/Interpreters/Cache/EvictionCandidates.h
index 7859762be09..752d26031e8 100644
--- a/src/Interpreters/Cache/EvictionCandidates.h
+++ b/src/Interpreters/Cache/EvictionCandidates.h
@@ -7,10 +7,20 @@ namespace DB
 class EvictionCandidates
 {
 public:
+    EvictionCandidates() = default;
+    EvictionCandidates(EvictionCandidates && other) noexcept
+    {
+        candidates = std::move(other.candidates);
+        candidates_size = std::move(other.candidates_size);
+        invalidated_queue_entries = std::move(other.invalidated_queue_entries);
+        finalize_eviction_func = std::move(other.finalize_eviction_func);
+    }
     ~EvictionCandidates();
 
     void add(LockedKey & locked_key, const FileSegmentMetadataPtr & candidate);
 
+    void add(const EvictionCandidates & other, const CacheGuard::Lock &) { candidates.insert(other.candidates.begin(), other.candidates.end()); }
+
     void evict();
 
     void finalize(FileCacheQueryLimit::QueryContext * query_context, const CacheGuard::Lock & lock);
diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp
index 90508d74554..bbaf3aebbee 100644
--- a/src/Interpreters/Cache/FileCache.cpp
+++ b/src/Interpreters/Cache/FileCache.cpp
@@ -85,6 +85,9 @@ FileCache::FileCache(const std::string & cache_name, const FileCacheSettings & s
     , boundary_alignment(settings.boundary_alignment)
     , load_metadata_threads(settings.load_metadata_threads)
     , write_cache_per_user_directory(settings.write_cache_per_user_id_directory)
+    , keep_current_size_to_max_ratio(1 - settings.keep_free_space_size_ratio)
+    , keep_current_elements_to_max_ratio(1 - settings.keep_free_space_elements_ratio)
+    , keep_up_free_space_remove_batch(settings.keep_free_space_remove_batch)
     , log(getLogger("FileCache(" + cache_name + ")"))
     , metadata(settings.base_path, settings.background_download_queue_size_limit, settings.background_download_threads, write_cache_per_user_directory)
 {
@@ -179,6 +182,10 @@ void FileCache::initialize()
     }
 
     metadata.startup();
+
+    if (keep_current_size_to_max_ratio != 1 || keep_current_elements_to_max_ratio != 1)
+        keep_up_free_space_ratio_task = Context::getGlobalContextInstance()->getSchedulePool().createTask(log->name(), [this] { freeSpaceRatioKeepingThreadFunc(); });
+
     is_initialized = true;
 }
 
@@ -884,6 +891,81 @@ bool FileCache::tryReserve(
     return true;
 }
 
+void FileCache::freeSpaceRatioKeepingThreadFunc()
+{
+    static constexpr auto lock_failed_reschedule_ms = 1000;
+    static constexpr auto space_ratio_satisfied_reschedule_ms = 5000;
+    static constexpr auto general_reschedule_ms = 5000;
+
+    while (true)
+    {
+        if (shutdown)
+            return;
+
+        auto lock = tryLockCache();
+        if (!lock)
+        {
+            keep_up_free_space_ratio_task->scheduleAfter(lock_failed_reschedule_ms);
+            return;
+        }
+
+        const size_t size_limit = main_priority->getSizeLimit(lock);
+        const size_t elements_limit = main_priority->getElementsLimit(lock);
+
+        const size_t desired_size = std::lround(keep_current_size_to_max_ratio * size_limit);
+        const size_t desired_elements_num = std::lround(keep_current_elements_to_max_ratio * elements_limit);
+
+        if ((size_limit == 0 || main_priority->getSize(lock) <= desired_size)
+            && (elements_limit == 0 || main_priority->getElementsCount(lock) <= desired_elements_num))
+        {
+            /// Nothing to free - all limits are satisfied.
+            keep_up_free_space_ratio_task->scheduleAfter(space_ratio_satisfied_reschedule_ms);
+            return;
+        }
+
+        try
+        {
+            FileCacheReserveStat stat;
+            auto eviction_candidates = main_priority->collectCandidatesForEviction(
+                desired_size, desired_elements_num, keep_up_free_space_remove_batch, stat, lock);
+
+            if (shutdown)
+                return;
+
+            if (eviction_candidates.size() == 0)
+            {
+                /// This case is impossible in a realistic cache setup,
+                /// i.e. we should always be able to evict something.
+                keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms);
+                return;
+            }
+
+            LOG_TRACE(log, "Current usage {}/{} in size, {}/{} in elements count "
+                     "(trying to keep size ratio at {} and elements ratio at {}). "
+                     "Collected {} eviction candidates, "
+                     "skipped {} candidates while iterating",
+                     main_priority->getSize(lock), size_limit,
+                     main_priority->getElementsCount(lock), elements_limit,
+                     desired_size, desired_elements_num,
+                     eviction_candidates.size(), stat.stat.non_releasable_count);
+
+            lock.unlock();
+            eviction_candidates.evict();
+
+            lock.lock();
+            eviction_candidates.finalize(nullptr, lock);
+        }
+        catch (...)
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+            keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms);
+
+            /// Let's catch such cases in CI; in general there should be no exceptions.
+            chassert(false);
+        }
+    }
+}
+
 void FileCache::iterate(IterateFunc && func, const UserID & user_id)
 {
     return metadata.iterate([&](const LockedKey & locked_key)
@@ -1213,6 +1295,7 @@ void FileCache::deactivateBackgroundOperations()
 {
     shutdown.store(true);
     metadata.shutdown();
+    if (keep_up_free_space_ratio_task)
+        keep_up_free_space_ratio_task->deactivate();
 }
 
 std::vector FileCache::getFileSegmentInfos(const UserID & user_id)
diff --git a/src/Interpreters/Cache/FileCache.h b/src/Interpreters/Cache/FileCache.h
index 5b665ad0271..22279eccb8b 100644
--- a/src/Interpreters/Cache/FileCache.h
+++ b/src/Interpreters/Cache/FileCache.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 
 #include
 
@@ -185,6 +186,8 @@ public:
 
     void applySettingsIfPossible(const FileCacheSettings & new_settings, FileCacheSettings & actual_settings);
 
+    void freeSpaceRatioKeepingThreadFunc();
+
 private:
     using KeyAndOffset = FileCacheKeyAndOffset;
 
@@ -194,6 +197,11 @@ private:
     size_t load_metadata_threads;
     const bool write_cache_per_user_directory;
 
+    BackgroundSchedulePool::TaskHolder keep_up_free_space_ratio_task;
+    const double keep_current_size_to_max_ratio;
+    const double keep_current_elements_to_max_ratio;
+    const size_t keep_up_free_space_remove_batch;
+
     LoggerPtr log;
 
     std::exception_ptr init_exception;
diff --git a/src/Interpreters/Cache/FileCacheSettings.cpp b/src/Interpreters/Cache/FileCacheSettings.cpp
index 8a48a2de68f..b7ee91a7ec2 100644
--- a/src/Interpreters/Cache/FileCacheSettings.cpp
+++ b/src/Interpreters/Cache/FileCacheSettings.cpp
@@ -78,6 +78,15 @@ void FileCacheSettings::loadImpl(FuncHas has, FuncGetUInt get_uint, FuncGetStrin
 
     if (has("write_cache_per_user_id_directory"))
         slru_size_ratio = get_uint("write_cache_per_user_id_directory");
+
+    if (has("keep_free_space_size_ratio"))
+        keep_free_space_size_ratio = get_double("keep_free_space_size_ratio");
+
+    if (has("keep_free_space_elements_ratio"))
+        keep_free_space_elements_ratio = get_double("keep_free_space_elements_ratio");
+
+    if (has("keep_free_space_remove_batch"))
+        keep_free_space_remove_batch = get_uint("keep_free_space_remove_batch");
 }
 
 void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
diff --git a/src/Interpreters/Cache/FileCacheSettings.h b/src/Interpreters/Cache/FileCacheSettings.h
index 14770b3f005..1ba9f7e17cc 100644
--- a/src/Interpreters/Cache/FileCacheSettings.h
+++ b/src/Interpreters/Cache/FileCacheSettings.h
@@ -38,6 +38,10 @@ struct FileCacheSettings
     std::string cache_policy = "LRU";
     double slru_size_ratio = 0.5;
 
+    double keep_free_space_size_ratio = FILECACHE_DEFAULT_FREE_SPACE_SIZE_RATIO;
+    double keep_free_space_elements_ratio = FILECACHE_DEFAULT_FREE_SPACE_ELEMENTS_RATIO;
+    size_t keep_free_space_remove_batch = FILECACHE_DEFAULT_FREE_SPACE_REMOVE_BATCH;
+
     void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
     void loadFromCollection(const NamedCollection & collection);
diff --git a/src/Interpreters/Cache/FileCache_fwd.h b/src/Interpreters/Cache/FileCache_fwd.h
index 06261b19db7..55453b78ead 100644
--- a/src/Interpreters/Cache/FileCache_fwd.h
+++ b/src/Interpreters/Cache/FileCache_fwd.h
@@ -12,6 +12,9 @@ static constexpr int FILECACHE_DEFAULT_LOAD_METADATA_THREADS = 16;
 static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 10000000;
 static constexpr int FILECACHE_DEFAULT_HITS_THRESHOLD = 0;
 static constexpr size_t FILECACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024;
+static constexpr double FILECACHE_DEFAULT_FREE_SPACE_SIZE_RATIO = 0; /// Disabled.
+static constexpr double FILECACHE_DEFAULT_FREE_SPACE_ELEMENTS_RATIO = 0; /// Disabled.
+static constexpr int FILECACHE_DEFAULT_FREE_SPACE_REMOVE_BATCH = 10;
 
 class FileCache;
 using FileCachePtr = std::shared_ptr;
diff --git a/src/Interpreters/Cache/IFileCachePriority.h b/src/Interpreters/Cache/IFileCachePriority.h
index 37270cf0873..8d30e88c112 100644
--- a/src/Interpreters/Cache/IFileCachePriority.h
+++ b/src/Interpreters/Cache/IFileCachePriority.h
@@ -98,6 +98,7 @@ public:
 
     virtual PriorityDumpPtr dump(const CacheGuard::Lock &) = 0;
 
+    /// Collect eviction candidates sufficient to free `size` bytes.
     virtual bool collectCandidatesForEviction(
         size_t size,
         FileCacheReserveStat & stat,
@@ -106,6 +107,14 @@ public:
         const UserID & user_id,
         const CacheGuard::Lock &) = 0;
 
+    /// Collect at most `max_candidates_to_evict` eviction candidates, trying to reach `desired_size` and `desired_elements_count`.
+    virtual EvictionCandidates collectCandidatesForEviction(
+        size_t desired_size,
+        size_t desired_elements_count,
+        size_t max_candidates_to_evict,
+        FileCacheReserveStat & stat,
+        const CacheGuard::Lock &) = 0;
+
     virtual bool modifySizeLimits(size_t max_size_, size_t max_elements_, double size_ratio_, const CacheGuard::Lock &) = 0;
 
 protected:
diff --git a/src/Interpreters/Cache/LRUFileCachePriority.cpp b/src/Interpreters/Cache/LRUFileCachePriority.cpp
index 05bbc26e602..cd122271ebe 100644
--- a/src/Interpreters/Cache/LRUFileCachePriority.cpp
+++ b/src/Interpreters/Cache/LRUFileCachePriority.cpp
@@ -230,6 +230,40 @@ bool LRUFileCachePriority::collectCandidatesForEviction(
     if (canFit(size, lock))
         return true;
 
+    auto can_fit = [&]
+    {
+        return canFit(size, stat.stat.releasable_size, stat.stat.releasable_count, lock);
+    };
+    iterateForEviction(res, stat, can_fit, lock);
+    return can_fit();
+}
+
+EvictionCandidates LRUFileCachePriority::collectCandidatesForEviction(
+    size_t desired_size,
+    size_t desired_elements_count,
+    size_t max_candidates_to_evict,
+    FileCacheReserveStat & stat,
+    const CacheGuard::Lock & lock)
+{
+    if (!max_candidates_to_evict)
+        return {};
+
+    EvictionCandidates res;
+    auto stop_condition = [&, this]()
+    {
+        return (getSize(lock) <= desired_size && getElementsCount(lock) <= desired_elements_count)
+            || res.size() >= max_candidates_to_evict;
+    };
+    iterateForEviction(res, stat, stop_condition, lock);
+    return res;
+}
+
+void LRUFileCachePriority::iterateForEviction(
+    EvictionCandidates & res,
+    FileCacheReserveStat & stat,
+    StopConditionFunc stop_condition,
+    const CacheGuard::Lock & lock)
+{
     ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictionTries);
 
     IterateFunc iterate_func = [&](LockedKey & locked_key, const FileSegmentMetadataPtr & segment_metadata)
@@ -240,6 +274,7 @@ bool LRUFileCachePriority::collectCandidatesForEviction(
         if (segment_metadata->evicting())
         {
             ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictionSkippedEvictingFileSegments);
+            stat.update(segment_metadata->size(), 
file_segment->getKind(), false); } else if (segment_metadata->releasable()) { @@ -248,27 +283,23 @@ bool LRUFileCachePriority::collectCandidatesForEviction( } else { - stat.update(segment_metadata->size(), file_segment->getKind(), false); ProfileEvents::increment(ProfileEvents::FilesystemCacheEvictionSkippedFileSegments); + stat.update(segment_metadata->size(), file_segment->getKind(), false); } return IterationResult::CONTINUE; }; - auto can_fit = [&] - { - return canFit(size, stat.stat.releasable_size, stat.stat.releasable_count, lock); - }; - iterate([&](LockedKey & locked_key, const FileSegmentMetadataPtr & segment_metadata) { - return can_fit() ? IterationResult::BREAK : iterate_func(locked_key, segment_metadata); + return stop_condition() ? IterationResult::BREAK : iterate_func(locked_key, segment_metadata); }, lock); - - return can_fit(); } -LRUFileCachePriority::LRUIterator LRUFileCachePriority::move(LRUIterator & it, LRUFileCachePriority & other, const CacheGuard::Lock &) +LRUFileCachePriority::LRUIterator LRUFileCachePriority::move( + LRUIterator & it, + LRUFileCachePriority & other, + const CacheGuard::Lock &) { const auto & entry = *it.getEntry(); if (entry.size == 0) diff --git a/src/Interpreters/Cache/LRUFileCachePriority.h b/src/Interpreters/Cache/LRUFileCachePriority.h index d8907f678a2..3e735682dc1 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.h +++ b/src/Interpreters/Cache/LRUFileCachePriority.h @@ -50,6 +50,13 @@ public: const UserID & user_id, const CacheGuard::Lock &) override; + EvictionCandidates collectCandidatesForEviction( + size_t desired_size, + size_t desired_elements_count, + size_t max_candidates_to_evict, + FileCacheReserveStat & stat, + const CacheGuard::Lock &) override; + void shuffle(const CacheGuard::Lock &) override; struct LRUPriorityDump : public IPriorityDump @@ -89,6 +96,13 @@ private: using IterateFunc = std::function; void iterate(IterateFunc && func, const CacheGuard::Lock &); + using StopConditionFunc = std::function; + void iterateForEviction( + EvictionCandidates & res, + FileCacheReserveStat & stat, + StopConditionFunc stop_condition, + const CacheGuard::Lock &); + LRUIterator move(LRUIterator & it, LRUFileCachePriority & other, const CacheGuard::Lock &); LRUIterator add(EntryPtr entry, const CacheGuard::Lock &); }; diff --git a/src/Interpreters/Cache/SLRUFileCachePriority.cpp b/src/Interpreters/Cache/SLRUFileCachePriority.cpp index 543d6a03669..316190d03e6 100644 --- a/src/Interpreters/Cache/SLRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/SLRUFileCachePriority.cpp @@ -157,6 +157,37 @@ bool SLRUFileCachePriority::collectCandidatesForEviction( return true; } +EvictionCandidates SLRUFileCachePriority::collectCandidatesForEviction( + size_t desired_size, + size_t desired_elements_count, + size_t max_candidates_to_evict, + FileCacheReserveStat & stat, + const CacheGuard::Lock & lock) +{ + if (!max_candidates_to_evict) + return {}; + + const auto desired_probationary_size = getRatio(desired_size, 1 - size_ratio); + const auto desired_probationary_elements_num = getRatio(desired_elements_count, 1 - size_ratio); + + auto res = probationary_queue.collectCandidatesForEviction( + desired_probationary_size, desired_probationary_elements_num, max_candidates_to_evict, stat, lock); + + chassert(res.size() <= max_candidates_to_evict); + chassert(res.size() == stat.stat.releasable_count); + + if (res.size() == max_candidates_to_evict) + return res; + + const auto desired_protected_size = getRatio(max_size, size_ratio); + const auto 
desired_protected_elements_num = getRatio(max_elements, size_ratio); + + auto res_add = protected_queue.collectCandidatesForEviction( + desired_protected_size, desired_protected_elements_num, max_candidates_to_evict - res.size(), stat, lock); + res.add(res_add, lock); + return res; +} + void SLRUFileCachePriority::increasePriority(SLRUIterator & iterator, const CacheGuard::Lock & lock) { /// If entry is already in protected queue, diff --git a/src/Interpreters/Cache/SLRUFileCachePriority.h b/src/Interpreters/Cache/SLRUFileCachePriority.h index 28e61396572..eda85291cd2 100644 --- a/src/Interpreters/Cache/SLRUFileCachePriority.h +++ b/src/Interpreters/Cache/SLRUFileCachePriority.h @@ -47,6 +47,13 @@ public: const UserID & user_id, const CacheGuard::Lock &) override; + EvictionCandidates collectCandidatesForEviction( + size_t desired_size, + size_t desired_elements_count, + size_t max_candidates_to_evict, + FileCacheReserveStat & stat, + const CacheGuard::Lock &) override; + void shuffle(const CacheGuard::Lock &) override; PriorityDumpPtr dump(const CacheGuard::Lock &) override; diff --git a/tests/config/config.d/storage_conf.xml b/tests/config/config.d/storage_conf.xml index 00d8cb3aea5..24e85031a3f 100644 --- a/tests/config/config.d/storage_conf.xml +++ b/tests/config/config.d/storage_conf.xml @@ -24,6 +24,8 @@ 100 LRU 0.3 + 0.1 + 0.1 cache From b0131a278bd13001f1d54413b4bc781eab8882a6 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 30 Jan 2024 21:38:07 +0100 Subject: [PATCH 013/192] impl --- .../MergeTree/MergeTreeDataWriter.cpp | 27 +++++++------------ src/Storages/MergeTree/MergeTreeDataWriter.h | 6 +++-- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index cadd94867ec..ce11a535e1c 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -618,7 +618,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( if (projection_block.rows()) { - auto proj_temp_part = writeProjectionPart(data, log, projection_block, projection, new_data_part.get()); + auto proj_temp_part + = writeProjectionPart(data, log, projection_block, projection, new_data_part.get(), /*merge_is_needed=*/false); new_data_part->addProjectionPart(projection.name, std::move(proj_temp_part.part)); for (auto & stream : proj_temp_part.streams) temp_part.streams.emplace_back(std::move(stream)); @@ -647,7 +648,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( const MergeTreeData & data, LoggerPtr log, Block block, - const ProjectionDescription & projection) + const ProjectionDescription & projection, + bool merge_is_needed) { TemporaryPart temp_part; const auto & metadata_snapshot = projection.metadata; @@ -716,7 +718,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( ProfileEvents::increment(ProfileEvents::MergeTreeDataProjectionWriterBlocksAlreadySorted); } - if (projection.type == ProjectionDescription::Type::Aggregate) + if (projection.type == ProjectionDescription::Type::Aggregate && merge_is_needed) { ProfileEventTimeIncrement watch(ProfileEvents::MergeTreeDataProjectionWriterMergingBlocksMicroseconds); @@ -756,16 +758,11 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPart( LoggerPtr log, Block block, const ProjectionDescription & projection, - IMergeTreeDataPart * parent_part) + IMergeTreeDataPart * parent_part, + bool 
merge_is_needed) { return writeProjectionPartImpl( - projection.name, - false /* is_temp */, - parent_part, - data, - log, - std::move(block), - projection); + projection.name, false /* is_temp */, parent_part, data, log, std::move(block), projection, merge_is_needed); } /// This is used for projection materialization process which may contain multiple stages of @@ -780,13 +777,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempProjectionPart( { auto part_name = fmt::format("{}_{}", projection.name, block_num); return writeProjectionPartImpl( - part_name, - true /* is_temp */, - parent_part, - data, - log, - std::move(block), - projection); + part_name, true /* is_temp */, parent_part, data, log, std::move(block), projection, /*merge_is_needed=*/true); } } diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 9dffea0a471..863c951d957 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -95,7 +95,8 @@ public: LoggerPtr log, Block block, const ProjectionDescription & projection, - IMergeTreeDataPart * parent_part); + IMergeTreeDataPart * parent_part, + bool merge_is_needed); /// For mutation: MATERIALIZE PROJECTION. static TemporaryPart writeTempProjectionPart( @@ -129,7 +130,8 @@ private: const MergeTreeData & data, LoggerPtr log, Block block, - const ProjectionDescription & projection); + const ProjectionDescription & projection, + bool merge_is_needed); MergeTreeData & data; LoggerPtr log; From 02ff01f2468b36479bd40abe23138bb28a4071b6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 19 Mar 2024 16:48:42 +0100 Subject: [PATCH 014/192] Fix build --- src/Interpreters/Cache/EvictionCandidates.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/EvictionCandidates.h b/src/Interpreters/Cache/EvictionCandidates.h index 74872618543..b786480aa69 100644 --- a/src/Interpreters/Cache/EvictionCandidates.h +++ b/src/Interpreters/Cache/EvictionCandidates.h @@ -19,7 +19,7 @@ public: void add(const FileSegmentMetadataPtr & candidate, LockedKey & locked_key, const CachePriorityGuard::Lock &); - void add(const EvictionCandidates & other, const CacheGuard::Lock &) { candidates.insert(other.candidates.begin(), other.candidates.end()); } + void add(const EvictionCandidates & other, const CachePriorityGuard::Lock &) { candidates.insert(other.candidates.begin(), other.candidates.end()); } void evict(); From 255e4b4bda442968197148051ac507266f1708ac Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 25 Mar 2024 14:22:39 +0000 Subject: [PATCH 015/192] fix test --- .../queries/0_stateless/02982_perf_introspection_for_inserts.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh b/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh index f5fb54b54d3..409bd996cbd 100755 --- a/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh +++ b/tests/queries/0_stateless/02982_perf_introspection_for_inserts.sh @@ -36,7 +36,7 @@ FROM numbers_mt(1000000); $CLICKHOUSE_CLIENT -q "SYSTEM FLUSH LOGS" $CLICKHOUSE_CLIENT -q """ SELECT - ProfileEvents['MergeTreeDataProjectionWriterMergingBlocksMicroseconds'] > 0, + ProfileEvents['MergeTreeDataProjectionWriterMergingBlocksMicroseconds'] = 0, ProfileEvents['MergeTreeDataProjectionWriterSortingBlocksMicroseconds'] > 0, ProfileEvents['MergeTreeDataWriterSortingBlocksMicroseconds'] > 0, 
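    -- (the first column above is expected to be 0: INSERTs now write aggregate projections with merge_is_needed = false, so the in-writer merge step is skipped)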
ProfileEvents['MergeTreeDataWriterProjectionsCalculationMicroseconds'] > 0, From 4bb827e987781c9a754b1c7c5eb1f46acd28a90a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B1=AA=E8=82=A5=E8=82=A5?= Date: Thu, 28 Mar 2024 14:28:42 +0800 Subject: [PATCH 016/192] fix npy big endianness --- src/Processors/Formats/Impl/NpyRowInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp index 795ad12ac98..65e0f9dd192 100644 --- a/src/Processors/Formats/Impl/NpyRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/NpyRowInputFormat.cpp @@ -131,7 +131,7 @@ std::shared_ptr parseType(String type) NumpyDataType::Endianness endianness; if (type[0] == '<') endianness = NumpyDataType::Endianness::LITTLE; - else if (type[1] == '>') + else if (type[0] == '>') endianness = NumpyDataType::Endianness::BIG; else if (type[0] == '|') endianness = NumpyDataType::Endianness::NONE; From fd925770b2de0ea067541a156c796552a94f3d11 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Mar 2024 17:00:54 +0100 Subject: [PATCH 017/192] Fixes after merge --- src/Interpreters/Cache/EvictionCandidates.h | 6 +- src/Interpreters/Cache/FileCache.cpp | 5 +- .../Cache/LRUFileCachePriority.cpp | 72 +++++++++---------- .../Cache/SLRUFileCachePriority.cpp | 2 +- 4 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/Interpreters/Cache/EvictionCandidates.h b/src/Interpreters/Cache/EvictionCandidates.h index 7468c159dae..b2bb6ae9e9f 100644 --- a/src/Interpreters/Cache/EvictionCandidates.h +++ b/src/Interpreters/Cache/EvictionCandidates.h @@ -7,15 +7,17 @@ namespace DB class EvictionCandidates { public: + using FinalizeEvictionFunc = std::function; + EvictionCandidates() = default; EvictionCandidates(EvictionCandidates && other) noexcept { candidates = std::move(other.candidates); candidates_size = std::move(other.candidates_size); + on_finalize = std::move(other.on_finalize); queue_entries_to_invalidate = std::move(other.queue_entries_to_invalidate); - finalize_eviction_func = std::move(other.finalize_eviction_func); + hold_space = std::move(other.hold_space); } - using FinalizeEvictionFunc = std::function; ~EvictionCandidates(); diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 8dfad6b8edf..455d11a3c1b 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -997,7 +997,7 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() main_priority->getSize(lock), size_limit, main_priority->getElementsCount(lock), elements_limit, desired_size, desired_elements_num, - eviction_candidates.size(), stat.stat.non_releasable_count); + eviction_candidates.size(), stat.total_stat.non_releasable_count); lock.unlock(); eviction_candidates.evict(); @@ -1345,7 +1345,8 @@ void FileCache::deactivateBackgroundOperations() { shutdown.store(true); metadata.shutdown(); - keep_up_free_space_ratio_task->deactivate(); + if (keep_up_free_space_ratio_task) + keep_up_free_space_ratio_task->deactivate(); } std::vector FileCache::getFileSegmentInfos(const UserID & user_id) diff --git a/src/Interpreters/Cache/LRUFileCachePriority.cpp b/src/Interpreters/Cache/LRUFileCachePriority.cpp index 4e375813727..3780cd1750f 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/LRUFileCachePriority.cpp @@ -275,10 +275,43 @@ bool LRUFileCachePriority::collectCandidatesForEviction( auto can_fit = [&] { - return canFit(size, 1, 
stat.stat.releasable_size, stat.stat.releasable_count, nullptr, nullptr, lock); + return canFit(size, elements, stat.total_stat.releasable_size, stat.total_stat.releasable_count, lock); }; + iterateForEviction(res, stat, can_fit, lock); - return can_fit(); + + if (can_fit()) + { + /// As eviction is done without a cache priority lock, + /// then if some space was partially available and some needed + /// to be freed via eviction, we need to make sure that this + /// partially available space is still available + /// after we finish with eviction for non-available space. + /// So we create a space holder for the currently available part + /// of the required space for the duration of eviction of the other + /// currently non-available part of the space. + + const size_t hold_size = size > stat.total_stat.releasable_size + ? size - stat.total_stat.releasable_size + : 0; + + const size_t hold_elements = elements > stat.total_stat.releasable_count + ? elements - stat.total_stat.releasable_count + : 0; + + if (hold_size || hold_elements) + res.setSpaceHolder(hold_size, hold_elements, *this, lock); + + // LOG_TEST(log, "Collected {} candidates for eviction (total size: {}). " + // "Took hold of size {} and elements {}", + // res.size(), stat.total_stat.releasable_size, hold_size, hold_elements); + + return true; + } + else + { + return false; + } } EvictionCandidates LRUFileCachePriority::collectCandidatesForEviction( @@ -334,41 +367,6 @@ void LRUFileCachePriority::iterateForEviction( }, lock); } - - if (can_fit()) - { - /// As eviction is done without a cache priority lock, - /// then if some space was partially available and some needed - /// to be freed via eviction, we need to make sure that this - /// partially available space is still available - /// after we finish with eviction for non-available space. - /// So we create a space holder for the currently available part - /// of the required space for the duration of eviction of the other - /// currently non-available part of the space. - - const size_t hold_size = size > stat.total_stat.releasable_size - ? size - stat.total_stat.releasable_size - : 0; - - const size_t hold_elements = elements > stat.total_stat.releasable_count - ? elements - stat.total_stat.releasable_count - : 0; - - if (hold_size || hold_elements) - res.setSpaceHolder(hold_size, hold_elements, *this, lock); - - // LOG_TEST(log, "Collected {} candidates for eviction (total size: {}). 
" - // "Took hold of size {} and elements {}", - // res.size(), stat.total_stat.releasable_size, hold_size, hold_elements); - - return true; - } - else - { - return false; - } -} - LRUFileCachePriority::LRUIterator LRUFileCachePriority::move( LRUIterator & it, LRUFileCachePriority & other, diff --git a/src/Interpreters/Cache/SLRUFileCachePriority.cpp b/src/Interpreters/Cache/SLRUFileCachePriority.cpp index 1503c23337c..96fd4185fbb 100644 --- a/src/Interpreters/Cache/SLRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/SLRUFileCachePriority.cpp @@ -266,7 +266,7 @@ EvictionCandidates SLRUFileCachePriority::collectCandidatesForEviction( desired_probationary_size, desired_probationary_elements_num, max_candidates_to_evict, stat, lock); chassert(res.size() <= max_candidates_to_evict); - chassert(res.size() == stat.stat.releasable_count); + chassert(res.size() == stat.total_stat.releasable_count); if (res.size() == max_candidates_to_evict) return res; From acd8a1f32e6640db767fb44e675c24cce4ea564c Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 29 Mar 2024 09:15:45 +0000 Subject: [PATCH 018/192] Enable custom parquet encoder by default --- src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 84e709294aa..f4e34aeea3c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1111,7 +1111,7 @@ class IColumn; M(ParquetVersion, output_format_parquet_version, "2.latest", "Parquet format version for output format. Supported versions: 1.0, 2.4, 2.6 and 2.latest (default)", 0) \ M(ParquetCompression, output_format_parquet_compression_method, "zstd", "Compression method for Parquet output format. Supported codecs: snappy, lz4, brotli, zstd, gzip, none (uncompressed)", 0) \ M(Bool, output_format_parquet_compliant_nested_types, true, "In parquet file schema, use name 'element' instead of 'item' for list elements. This is a historical artifact of Arrow library implementation. Generally increases compatibility, except perhaps with some old versions of Arrow.", 0) \ - M(Bool, output_format_parquet_use_custom_encoder, false, "Use a faster Parquet encoder implementation.", 0) \ + M(Bool, output_format_parquet_use_custom_encoder, true, "Use a faster Parquet encoder implementation.", 0) \ M(Bool, output_format_parquet_parallel_encoding, true, "Do Parquet encoding in multiple threads. Requires output_format_parquet_use_custom_encoder.", 0) \ M(UInt64, output_format_parquet_data_page_size, 1024 * 1024, "Target page size in bytes, before compression.", 0) \ M(UInt64, output_format_parquet_batch_size, 1024, "Check page size every this many rows. 
Consider decreasing if you have columns with average values size above a few KBs.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 170836cb980..7ea721ec955 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -125,6 +125,7 @@ static std::map sett {"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."}, {"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."}, {"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."}, + {"output_format_parquet_use_custom_encoder", false, true, "Enable custom Parquet encoder."}, }}, {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, From 1cd73b907c1493729699578231fa223f57de2f35 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 5 Apr 2024 12:59:57 +0200 Subject: [PATCH 019/192] Better --- src/Interpreters/Cache/EvictionCandidates.h | 11 -- src/Interpreters/Cache/FileCache.cpp | 123 ++++++++++++------ src/Interpreters/Cache/IFileCachePriority.h | 3 +- .../Cache/LRUFileCachePriority.cpp | 6 +- src/Interpreters/Cache/LRUFileCachePriority.h | 3 +- .../Cache/SLRUFileCachePriority.cpp | 33 +++-- .../Cache/SLRUFileCachePriority.h | 3 +- tests/config/config.d/storage_conf.xml | 6 +- 8 files changed, 116 insertions(+), 72 deletions(-) diff --git a/src/Interpreters/Cache/EvictionCandidates.h b/src/Interpreters/Cache/EvictionCandidates.h index b2bb6ae9e9f..db062561ad7 100644 --- a/src/Interpreters/Cache/EvictionCandidates.h +++ b/src/Interpreters/Cache/EvictionCandidates.h @@ -10,15 +10,6 @@ public: using FinalizeEvictionFunc = std::function; EvictionCandidates() = default; - EvictionCandidates(EvictionCandidates && other) noexcept - { - candidates = std::move(other.candidates); - candidates_size = std::move(other.candidates_size); - on_finalize = std::move(other.on_finalize); - queue_entries_to_invalidate = std::move(other.queue_entries_to_invalidate); - hold_space = std::move(other.hold_space); - } - ~EvictionCandidates(); void add( @@ -26,8 +17,6 @@ public: LockedKey & locked_key, const CachePriorityGuard::Lock &); - void add(const EvictionCandidates & other, const CachePriorityGuard::Lock &) { candidates.insert(other.candidates.begin(), other.candidates.end()); } - void evict(); void onFinalize(FinalizeEvictionFunc && func) { on_finalize.emplace_back(std::move(func)); } diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 3f16a8dde4e..61401c8ea0a 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -953,49 +953,71 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() static constexpr auto space_ratio_satisfied_reschedule_ms = 5000; static constexpr auto general_reschedule_ms = 5000; - while (true) + if (shutdown) + return; + + Stopwatch watch; + + auto lock = tryLockCache(); + + /// To avoid deteriorating contention on cache, + /// proceed only if cache 
is not heavily used. + if (!lock) { + keep_up_free_space_ratio_task->scheduleAfter(lock_failed_reschedule_ms); + return; + } + + const size_t size_limit = main_priority->getSizeLimit(lock); + const size_t elements_limit = main_priority->getElementsLimit(lock); + + const size_t desired_size = std::lround(keep_current_size_to_max_ratio * size_limit); + const size_t desired_elements_num = std::lround(keep_current_elements_to_max_ratio * elements_limit); + + if ((size_limit == 0 || main_priority->getSize(lock) <= desired_size) + && (elements_limit == 0 || main_priority->getElementsCount(lock) <= desired_elements_num)) + { + /// Nothing to free - all limits are satisfied. + keep_up_free_space_ratio_task->scheduleAfter(space_ratio_satisfied_reschedule_ms); + return; + } + + FileCacheReserveStat stat; + EvictionCandidates eviction_candidates; + + try + { + /// Collect at most `keep_up_free_space_remove_batch` elements to evict, + /// (we use batches to make sure we do not block cache for too long, + /// by default the batch size is quite small). + const bool limits_satisfied = main_priority->collectCandidatesForEviction( + desired_size, desired_elements_num, keep_up_free_space_remove_batch, stat, eviction_candidates, lock); + +#ifdef ABORT_ON_LOGICAL_ERROR + /// Let's make sure that we correctly processed the limits. + if (limits_satisfied && eviction_candidates.size() < keep_up_free_space_remove_batch) + { + const auto current_size = main_priority->getSize(lock); + chassert(current_size >= stat.total_stat.releasable_size); + chassert(!size_limit + || current_size <= desired_size + || current_size - stat.total_stat.releasable_size <= desired_size); + + const auto current_elements_count = main_priority->getElementsCount(lock); + chassert(current_elements_count >= stat.total_stat.releasable_count); + chassert(!elements_limit + || current_elements_count <= desired_elements_num + || current_elements_count - stat.total_stat.releasable_count <= desired_elements_num); + } +#else + UNUSED(limits_satisfied); +#endif + if (shutdown) return; - auto lock = tryLockCache(); - if (!lock) + if (eviction_candidates.size() > 0) { - keep_up_free_space_ratio_task->scheduleAfter(lock_failed_reschedule_ms); - return; - } - - const size_t size_limit = main_priority->getSizeLimit(lock); - const size_t elements_limit = main_priority->getElementsLimit(lock); - - const size_t desired_size = std::lround(keep_current_size_to_max_ratio * size_limit); - const size_t desired_elements_num = std::lround(keep_current_elements_to_max_ratio * elements_limit); - - if ((size_limit == 0 || main_priority->getSize(lock) <= desired_size) - && (elements_limit == 0 || main_priority->getElementsCount(lock) <= desired_elements_num)) - { - /// Nothing to free - all limits are satisfied. - keep_up_free_space_ratio_task->scheduleAfter(space_ratio_satisfied_reschedule_ms); - return; - } - - try - { - FileCacheReserveStat stat; - auto eviction_candidates = main_priority->collectCandidatesForEviction( - desired_size, desired_elements_num, keep_up_free_space_remove_batch, stat, lock); - - if (shutdown) - return; - - if (eviction_candidates.size() == 0) - { - /// This case is impossible in realistic cache setup, - /// e.g. we should always be able to evict something. - keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms); - return; - } - LOG_TRACE(log, "Current usage {}/{} in size, {}/{} in elements count " "(trying to keep size ration at {} and elements ratio at {}). 
" "Collected {} eviction candidates, " @@ -1006,20 +1028,35 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() eviction_candidates.size(), stat.total_stat.non_releasable_count); lock.unlock(); + + /// Remove files from filesystem. eviction_candidates.evict(); + /// Take lock again to finalize eviction, + /// e.g. to update the in-memory state. lock.lock(); eviction_candidates.finalize(nullptr, lock); } - catch (...) + else { - tryLogCurrentException(__PRETTY_FUNCTION__); keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms); - - /// Let's catch such cases in ci, in general there should not be exceptions. - chassert(false); } } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + + if (eviction_candidates.size() > 0) + eviction_candidates.finalize(nullptr, lockCache()); + + keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms); + + /// Let's catch such cases in ci, + /// in general there should not be exceptions. + chassert(false); + } + + LOG_TRACE(log, "Free space ratio keeping thread finished in {} ms", watch.elapsedMilliseconds()); } void FileCache::iterate(IterateFunc && func, const UserID & user_id) diff --git a/src/Interpreters/Cache/IFileCachePriority.h b/src/Interpreters/Cache/IFileCachePriority.h index dba925ba37b..e1ac6c6661d 100644 --- a/src/Interpreters/Cache/IFileCachePriority.h +++ b/src/Interpreters/Cache/IFileCachePriority.h @@ -148,11 +148,12 @@ public: const CachePriorityGuard::Lock &) = 0; /// Collect eviction `candidates_num` candidates for eviction. - virtual EvictionCandidates collectCandidatesForEviction( + virtual bool collectCandidatesForEviction( size_t desired_size, size_t desired_elements_count, size_t max_candidates_to_evict, FileCacheReserveStat & stat, + EvictionCandidates & candidates, const CachePriorityGuard::Lock &) = 0; virtual bool modifySizeLimits(size_t max_size_, size_t max_elements_, double size_ratio_, const CachePriorityGuard::Lock &) = 0; diff --git a/src/Interpreters/Cache/LRUFileCachePriority.cpp b/src/Interpreters/Cache/LRUFileCachePriority.cpp index 203fc680b47..8ac60cd9e6e 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/LRUFileCachePriority.cpp @@ -322,24 +322,24 @@ bool LRUFileCachePriority::collectCandidatesForEviction( } } -EvictionCandidates LRUFileCachePriority::collectCandidatesForEviction( +bool LRUFileCachePriority::collectCandidatesForEviction( size_t desired_size, size_t desired_elements_count, size_t max_candidates_to_evict, FileCacheReserveStat & stat, + EvictionCandidates & res, const CachePriorityGuard::Lock & lock) { if (!max_candidates_to_evict) return {}; - EvictionCandidates res; auto stop_condition = [&, this]() { return (getSize(lock) <= desired_size && getElementsCount(lock) <= desired_elements_count) || res.size() >= max_candidates_to_evict; }; iterateForEviction(res, stat, stop_condition, lock); - return res; + return stop_condition(); } void LRUFileCachePriority::iterateForEviction( diff --git a/src/Interpreters/Cache/LRUFileCachePriority.h b/src/Interpreters/Cache/LRUFileCachePriority.h index d05c67fe5d0..4cc80f07664 100644 --- a/src/Interpreters/Cache/LRUFileCachePriority.h +++ b/src/Interpreters/Cache/LRUFileCachePriority.h @@ -62,11 +62,12 @@ public: const UserID & user_id, const CachePriorityGuard::Lock &) override; - EvictionCandidates collectCandidatesForEviction( + bool collectCandidatesForEviction( size_t desired_size, size_t desired_elements_count, size_t max_candidates_to_evict, FileCacheReserveStat & stat, + 
EvictionCandidates & res, const CachePriorityGuard::Lock &) override; void shuffle(const CachePriorityGuard::Lock &) override; diff --git a/src/Interpreters/Cache/SLRUFileCachePriority.cpp b/src/Interpreters/Cache/SLRUFileCachePriority.cpp index a2730d88abb..4b6221d972a 100644 --- a/src/Interpreters/Cache/SLRUFileCachePriority.cpp +++ b/src/Interpreters/Cache/SLRUFileCachePriority.cpp @@ -251,11 +251,12 @@ bool SLRUFileCachePriority::collectCandidatesForEvictionInProtected( return true; } -EvictionCandidates SLRUFileCachePriority::collectCandidatesForEviction( +bool SLRUFileCachePriority::collectCandidatesForEviction( size_t desired_size, size_t desired_elements_count, size_t max_candidates_to_evict, FileCacheReserveStat & stat, + EvictionCandidates & res, const CachePriorityGuard::Lock & lock) { if (!max_candidates_to_evict) @@ -264,22 +265,36 @@ EvictionCandidates SLRUFileCachePriority::collectCandidatesForEviction( const auto desired_probationary_size = getRatio(desired_size, 1 - size_ratio); const auto desired_probationary_elements_num = getRatio(desired_elements_count, 1 - size_ratio); - auto res = probationary_queue.collectCandidatesForEviction( - desired_probationary_size, desired_probationary_elements_num, max_candidates_to_evict, stat, lock); + FileCacheReserveStat probationary_stat; + const bool probationary_limit_satisfied = probationary_queue.collectCandidatesForEviction( + desired_probationary_size, desired_probationary_elements_num, + max_candidates_to_evict, probationary_stat, res, lock); + + stat += probationary_stat; + + LOG_TEST(log, "Collected {} to evict from probationary queue. Total size: {}", + res.size(), probationary_stat.total_stat.releasable_size); chassert(res.size() <= max_candidates_to_evict); chassert(res.size() == stat.total_stat.releasable_count); - if (res.size() == max_candidates_to_evict) - return res; + if (res.size() >= max_candidates_to_evict) + return probationary_limit_satisfied; const auto desired_protected_size = getRatio(max_size, size_ratio); const auto desired_protected_elements_num = getRatio(max_elements, size_ratio); - auto res_add = protected_queue.collectCandidatesForEviction( - desired_protected_size, desired_protected_elements_num, max_candidates_to_evict - res.size(), stat, lock); - res.add(res_add, lock); - return res; + FileCacheReserveStat protected_stat; + const bool protected_limit_satisfied = protected_queue.collectCandidatesForEviction( + desired_protected_size, desired_protected_elements_num, + max_candidates_to_evict - res.size(), protected_stat, res, lock); + + stat += protected_stat; + + LOG_TEST(log, "Collected {} to evict from protected queue. 
Total size: {}", + res.size(), protected_stat.total_stat.releasable_size); + + return probationary_limit_satisfied && protected_limit_satisfied; } void SLRUFileCachePriority::downgrade(IteratorPtr iterator, const CachePriorityGuard::Lock & lock) diff --git a/src/Interpreters/Cache/SLRUFileCachePriority.h b/src/Interpreters/Cache/SLRUFileCachePriority.h index e837b8541c2..ee3cafe322d 100644 --- a/src/Interpreters/Cache/SLRUFileCachePriority.h +++ b/src/Interpreters/Cache/SLRUFileCachePriority.h @@ -58,11 +58,12 @@ public: const UserID & user_id, const CachePriorityGuard::Lock &) override; - EvictionCandidates collectCandidatesForEviction( + bool collectCandidatesForEviction( size_t desired_size, size_t desired_elements_count, size_t max_candidates_to_evict, FileCacheReserveStat & stat, + EvictionCandidates & res, const CachePriorityGuard::Lock &) override; void shuffle(const CachePriorityGuard::Lock &) override; diff --git a/tests/config/config.d/storage_conf.xml b/tests/config/config.d/storage_conf.xml index 24e85031a3f..9d1f7e6b474 100644 --- a/tests/config/config.d/storage_conf.xml +++ b/tests/config/config.d/storage_conf.xml @@ -19,13 +19,13 @@ cache s3_disk s3_cache/ - 64Mi + 100Mi 1 100 LRU 0.3 - 0.1 - 0.1 + 0.15 + 0.15 cache From 832c7087a7c0cf0692140d349cd52b44278454d3 Mon Sep 17 00:00:00 2001 From: zhongyuankai <872237106@qq.com> Date: Sat, 6 Apr 2024 10:31:45 +0800 Subject: [PATCH 020/192] optimize drop tables --- src/Interpreters/DDLTask.cpp | 9 --------- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Parsers/ASTDropQuery.h | 3 ++- src/Parsers/ParserDropQuery.cpp | 11 +++++++++++ src/Parsers/tests/gtest_dictionary_parser.cpp | 12 ++++-------- .../0_stateless/02961_drop_tables.reference | 3 +++ tests/queries/0_stateless/02961_drop_tables.sql | 14 ++++++++++++++ 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 37954850851..a37b4db029a 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -16,7 +16,6 @@ #include #include #include -#include namespace DB @@ -201,14 +200,6 @@ void DDLTaskBase::parseQueryFromEntry(ContextPtr context) ParserQuery parser_query(end, settings.allow_settings_after_format_in_insert); String description; query = parseQuery(parser_query, begin, end, description, 0, settings.max_parser_depth, settings.max_parser_backtracks); - if (auto * query_drop = query->as()) - { - ASTs drops = query_drop->getRewrittenASTsOfSingleTable(); - if (drops.size() > 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Not supports drop multiple tables for ddl task."); - - query = drops[0]; - } } void DDLTaskBase::formatRewrittenQuery(ContextPtr context) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e29e59ee4c3..0e591a7782a 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -557,7 +557,7 @@ bool InterpreterDropQuery::supportsTransactions() const return drop.cluster.empty() && !drop.temporary && drop.kind == ASTDropQuery::Kind::Truncate - && drop.database_and_tables; + && drop.table; } void registerInterpreterDropQuery(InterpreterFactory & factory) diff --git a/src/Parsers/ASTDropQuery.h b/src/Parsers/ASTDropQuery.h index e0e908733e5..7e25e990bc8 100644 --- a/src/Parsers/ASTDropQuery.h +++ b/src/Parsers/ASTDropQuery.h @@ -40,7 +40,7 @@ public: // We detach the object permanently, so it will not be reattached back during server restart. 
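    /// See `database_and_tables` a few lines below. In SQL terms, multi-table drops
    /// behave roughly as follows (table names are illustrative):
    ///     DROP TABLE t1, t2, t3;   -- one query, expanded into three single-table drop ASTs
    ///     TRUNCATE TABLE t1, t2;   -- rejected with SYNTAX_ERROR: only the Drop kind accepts a list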
bool permanently{false}; - /// Example: Drop TABLE t1, t2, t3... + /// Used to drop multiple tables only, example: Drop TABLE t1, t2, t3... ASTPtr database_and_tables; /** Get the text that identifies this element. */ @@ -52,6 +52,7 @@ public: return removeOnCluster(clone(), params.default_database); } + /** Convert an AST that deletes multiple tables into multiple ASTs that delete a single table. */ ASTs getRewrittenASTsOfSingleTable(); QueryKind getQueryKind() const override { return QueryKind::Drop; } diff --git a/src/Parsers/ParserDropQuery.cpp b/src/Parsers/ParserDropQuery.cpp index 09f15e9649f..9fe8306c0c2 100644 --- a/src/Parsers/ParserDropQuery.cpp +++ b/src/Parsers/ParserDropQuery.cpp @@ -7,6 +7,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int SYNTAX_ERROR; +} + namespace { @@ -84,6 +89,9 @@ bool parseDropQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected, cons if (!tables_p.parse(pos, database_and_tables, expected)) return false; + + if (database_and_tables->size() > 1 && kind != ASTDropQuery::Kind::Drop) + throw Exception(ErrorCodes::SYNTAX_ERROR, "Only Support DROP multiple tables currently"); } /// common for tables / dictionaries / databases @@ -123,6 +131,9 @@ bool parseDropQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected, cons query->cluster = cluster_str; + if (database_and_tables && database_and_tables->size() == 1) + node = query->getRewrittenASTsOfSingleTable()[0]; + return true; } diff --git a/src/Parsers/tests/gtest_dictionary_parser.cpp b/src/Parsers/tests/gtest_dictionary_parser.cpp index 44205975cdc..a1ba46125a7 100644 --- a/src/Parsers/tests/gtest_dictionary_parser.cpp +++ b/src/Parsers/tests/gtest_dictionary_parser.cpp @@ -301,10 +301,8 @@ TEST(ParserDictionaryDDL, ParseDropQuery) ASTDropQuery * drop1 = ast1->as(); EXPECT_TRUE(drop1->is_dictionary); - auto & database_and_tables1 = drop1->database_and_tables->as(); - auto identifier1 = dynamic_pointer_cast(database_and_tables1.children[0]); - EXPECT_EQ(identifier1->getDatabaseName(), "test"); - EXPECT_EQ(identifier1->shortName(), "dict1"); + EXPECT_EQ(drop1->getDatabase(), "test"); + EXPECT_EQ(drop1->getTable(), "dict1"); auto str1 = serializeAST(*drop1); EXPECT_EQ(input1, str1); @@ -314,10 +312,8 @@ TEST(ParserDictionaryDDL, ParseDropQuery) ASTDropQuery * drop2 = ast2->as(); EXPECT_TRUE(drop2->is_dictionary); - auto & database_and_tables2 = drop2->database_and_tables->as(); - auto identifier2 = dynamic_pointer_cast(database_and_tables2.children[0]); - EXPECT_EQ(identifier2->getDatabaseName(), ""); - EXPECT_EQ(identifier2->shortName(), "dict2"); + EXPECT_EQ(drop2->getDatabase(), ""); + EXPECT_EQ(drop2->getTable(), "dict2"); auto str2 = serializeAST(*drop2); EXPECT_EQ(input2, str2); } diff --git a/tests/queries/0_stateless/02961_drop_tables.reference b/tests/queries/0_stateless/02961_drop_tables.reference index c0465dc592a..8ccdec0a1b0 100644 --- a/tests/queries/0_stateless/02961_drop_tables.reference +++ b/tests/queries/0_stateless/02961_drop_tables.reference @@ -6,3 +6,6 @@ Test when deletion of existing table fails -- check which tables exist in 02961_db1 -- check which tables exist in 02961_db2 02961_tb5 +Test when deletion of not empty table fails +tab2 +tab3 diff --git a/tests/queries/0_stateless/02961_drop_tables.sql b/tests/queries/0_stateless/02961_drop_tables.sql index e91ac4bfe19..f84fffbef75 100644 --- a/tests/queries/0_stateless/02961_drop_tables.sql +++ b/tests/queries/0_stateless/02961_drop_tables.sql @@ -27,6 +27,20 @@ SHOW TABLES FROM 02961_db1; SELECT '-- 
check which tables exist in 02961_db2'; SHOW TABLES FROM 02961_db2; +DROP TABLE IF EXISTS tab1, tab2, tab3; +CREATE TABLE IF NOT EXISTS tab1 (id UInt32) Engine=Memory(); +CREATE TABLE IF NOT EXISTS tab2 (id UInt32) Engine=Memory(); +CREATE TABLE IF NOT EXISTS tab3 (id UInt32) Engine=Memory(); + +INSERT INTO tab2 SELECT number FROM system.numbers limit 10; + +DROP TABLE IF EMPTY tab1, tab2, tab3; -- { serverError TABLE_NOT_EMPTY } +SELECT 'Test when deletion of not empty table fails'; +SHOW TABLES; + +TRUNCATE TABLE tab2, tab3; -- { clientError SYNTAX_ERROR } + +DROP TABLE IF EXISTS tab1, tab2, tab3; DROP DATABASE IF EXISTS 02961_db1; DROP DATABASE IF EXISTS 02961_db2; From d5aa23659043ae17a267bef3353a383e5383a24e Mon Sep 17 00:00:00 2001 From: zhongyuankai <54787696+zhongyuankai@users.noreply.github.com> Date: Mon, 8 Apr 2024 19:40:27 +0800 Subject: [PATCH 021/192] fix test --- src/Parsers/ParserDropQuery.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Parsers/ParserDropQuery.cpp b/src/Parsers/ParserDropQuery.cpp index 9fe8306c0c2..6efafa112d3 100644 --- a/src/Parsers/ParserDropQuery.cpp +++ b/src/Parsers/ParserDropQuery.cpp @@ -90,7 +90,7 @@ bool parseDropQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected, cons if (!tables_p.parse(pos, database_and_tables, expected)) return false; - if (database_and_tables->size() > 1 && kind != ASTDropQuery::Kind::Drop) + if (database_and_tables->as().children.size() > 1 && kind != ASTDropQuery::Kind::Drop) throw Exception(ErrorCodes::SYNTAX_ERROR, "Only Support DROP multiple tables currently"); } @@ -131,7 +131,7 @@ bool parseDropQuery(IParser::Pos & pos, ASTPtr & node, Expected & expected, cons query->cluster = cluster_str; - if (database_and_tables && database_and_tables->size() == 1) + if (database_and_tables && database_and_tables->as().children.size() == 1) node = query->getRewrittenASTsOfSingleTable()[0]; return true; From 80c494a900474be5208e7be82793de86171408b3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 8 Apr 2024 15:23:41 +0200 Subject: [PATCH 022/192] Fxi --- src/Interpreters/Cache/FileCache.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 61401c8ea0a..83e17ddb2bb 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -193,6 +193,8 @@ void FileCache::initialize() if (keep_current_size_to_max_ratio != 1 || keep_current_elements_to_max_ratio != 1) keep_up_free_space_ratio_task = Context::getGlobalContextInstance()->getSchedulePool().createTask(log->name(), [this] { freeSpaceRatioKeepingThreadFunc(); }); + keep_up_free_space_ratio_task->schedule(); + is_initialized = true; } From 7404e7d27bf6ecadba46f97395a60d48894c1374 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 8 Apr 2024 15:24:00 +0200 Subject: [PATCH 023/192] Add a test --- .../integration/test_filesystem_cache/test.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/integration/test_filesystem_cache/test.py b/tests/integration/test_filesystem_cache/test.py index dfab462732a..17a8dd8b6e1 100644 --- a/tests/integration/test_filesystem_cache/test.py +++ b/tests/integration/test_filesystem_cache/test.py @@ -501,3 +501,60 @@ INSERT INTO test SELECT 1, 'test'; node.query("SELECT * FROM test FORMAT Null") assert key not in node.query("SYSTEM SYNC FILESYSTEM CACHE") + + +def test_keep_up_size_ratio(cluster): + node = cluster.instances["node"] + max_elements = 20 + elements_ratio = 0.5 + 
cache_name = "keep_up_size_ratio" + node.query( + f""" +DROP TABLE IF EXISTS test; + +CREATE TABLE test (a String) +ENGINE = MergeTree() ORDER BY tuple() +SETTINGS disk = disk(type = cache, + name = {cache_name}, + max_size = '100Ki', + max_elements = {max_elements}, + max_file_segment_size = 10, + boundary_alignment = 10, + path = "test_keep_up_size_ratio", + keep_free_space_size_ratio = 0.5, + keep_free_space_elements_ratio = {elements_ratio}, + disk = hdd_blob), + min_bytes_for_wide_part = 10485760; + +INSERT INTO test SELECT randomString(200); + """ + ) + + query_id = "test_keep_up_size_ratio_1" + node.query( + "SELECT * FROM test FORMAT Null SETTINGS enable_filesystem_cache_log = 1", + query_id=query_id, + ) + count = int( + node.query( + f""" + SYSTEM FLUSH LOGS; + SELECT uniqExact(concat(key, toString(offset))) + FROM system.filesystem_cache_log + WHERE read_type = 'READ_FROM_FS_AND_DOWNLOADED_TO_CACHE'; + """ + ) + ) + assert count > max_elements + + expected = 10 + for _ in range(100): + elements = int( + node.query( + f"SELECT count() FROM system.filesystem_cache WHERE cache_name = '{cache_name}'" + ) + ) + if elements <= expected: + break + time.sleep(1) + assert elements <= expected From ed6e4fbe162e9398a5f3b2ce24ed4afb31141f97 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 8 Apr 2024 13:56:15 +0000 Subject: [PATCH 024/192] Improve trivial insert select from files, add separate max_parsing_threads setting --- src/Core/Settings.h | 1 + src/Core/SettingsChangesHistory.h | 1 + src/Formats/FormatFactory.cpp | 2 +- src/Storages/StorageFile.cpp | 2 +- src/Storages/StorageS3.cpp | 4 +-- src/Storages/StorageURL.cpp | 6 ++-- .../trivial_insert_select_from_files.xml | 30 +++++++++++++++++++ 7 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 tests/performance/trivial_insert_select_from_files.xml diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 84e709294aa..67a12d1e6d5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -50,6 +50,7 @@ class IColumn; M(MaxThreads, max_threads, 0, "The maximum number of threads to execute the request. By default, it is determined automatically.", 0) \ M(Bool, use_concurrency_control, true, "Respect the server's concurrency control (see the `concurrent_threads_soft_limit_num` and `concurrent_threads_soft_limit_ratio_to_cores` global server settings). If disabled, it allows using a larger number of threads even if the server is overloaded (not recommended for normal usage, and needed mostly for tests).", 0) \ M(MaxThreads, max_download_threads, 4, "The maximum number of threads to download data (e.g. for URL engine).", 0) \ + M(MaxThreads, max_parsing_threads, 0, "The maximum number of threads to parse data in input formats that support parallel parsing. By default, it is determined automatically", 0) \ M(UInt64, max_download_buffer_size, 10*1024*1024, "The maximal size of buffer for parallel downloading (e.g. for URL engine) per each thread.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ M(UInt64, max_read_buffer_size_local_fs, 128*1024, "The maximum size of the buffer to read from local filesystem. 
If set to 0 then max_read_buffer_size will be used.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 768b6aa6cbd..a26e215ce35 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -86,6 +86,7 @@ namespace SettingsChangesHistory static std::map settings_changes_history = { {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, + {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"}, }}, {"24.3", {{"s3_connect_timeout_ms", 1000, 1000, "Introduce new dedicated setting for s3 connection timeout"}, {"allow_experimental_shared_merge_tree", false, true, "The setting is obsolete"}, diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 8cbb1b9e563..1cc3dabd954 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -303,7 +303,7 @@ InputFormatPtr FormatFactory::getInput( auto format_settings = _format_settings ? *_format_settings : getFormatSettings(context); const Settings & settings = context->getSettingsRef(); - size_t max_parsing_threads = _max_parsing_threads.value_or(settings.max_threads); + size_t max_parsing_threads = _max_parsing_threads.value_or(settings.max_parsing_threads); size_t max_download_threads = _max_download_threads.value_or(settings.max_download_threads); RowInputFormatParams row_input_format_params; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 0d220f2fd5d..54ba567b838 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1392,7 +1392,7 @@ Chunk StorageFileSource::generate() chassert(file_num > 0); - const auto max_parsing_threads = std::max(settings.max_threads / file_num, 1UL); + const auto max_parsing_threads = std::max(settings.max_parsing_threads / file_num, 1UL); input_format = FormatFactory::instance().getInput( storage->format_name, *read_buf, block_for_format, getContext(), max_block_size, storage->format_settings, max_parsing_threads, std::nullopt, /*is_remote_fs*/ false, CompressionMethod::None, need_only_count); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d3aef312bf..c7b80f4912a 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1206,8 +1206,8 @@ void ReadFromStorageS3Step::initializePipeline(QueryPipelineBuilder & pipeline, /// Disclosed glob iterator can underestimate the amount of keys in some cases. We will keep one stream for this particular case. num_streams = 1; - const size_t max_threads = context->getSettingsRef().max_threads; - const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / std::max(num_streams, 1ul)); + const auto & settings = context->getSettingsRef(); + const size_t max_parsing_threads = num_streams >= settings.max_parsing_threads ? 
1 : (settings.max_parsing_threads / std::max(num_streams, 1ul)); LOG_DEBUG(getLogger("StorageS3"), "Reading in {} streams, {} threads per stream", num_streams, max_parsing_threads); Pipes pipes; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index cc46cc8f8dc..be23ff50647 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -1172,8 +1172,8 @@ void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const Buil Pipes pipes; pipes.reserve(num_streams); - const size_t max_threads = context->getSettingsRef().max_threads; - const size_t max_parsing_threads = num_streams >= max_threads ? 1 : (max_threads / num_streams); + const auto & settings = context->getSettingsRef(); + const size_t max_parsing_threads = num_streams >= settings.max_parsing_threads ? 1 : (settings.max_parsing_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { @@ -1204,7 +1204,7 @@ void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const Buil auto pipe = Pipe::unitePipes(std::move(pipes)); size_t output_ports = pipe.numOutputPorts(); - const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages; + const bool parallelize_output = settings.parallelize_output_from_storages; if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams) pipe.resize(max_num_streams); diff --git a/tests/performance/trivial_insert_select_from_files.xml b/tests/performance/trivial_insert_select_from_files.xml new file mode 100644 index 00000000000..f6ec6500c10 --- /dev/null +++ b/tests/performance/trivial_insert_select_from_files.xml @@ -0,0 +1,30 @@ + + + + + format + + TabSeparated + TabSeparatedWithNames + TabSeparatedWithNamesAndTypes + CSV + CSVWithNames + JSONEachRow + JSONCompactEachRow + JSONCompactEachRowWithNamesAndTypes + TSKV + + + + +CREATE TABLE IF NOT EXISTS table_src_{format} ENGINE = File({format}) AS test.hits +CREATE TABLE IF NOT EXISTS table_dst_{format} AS test.hits + +INSERT INTO table_src_{format} SELECT * FROM test.hits LIMIT 100000 + +INSERT INTO table_dst_{format} SELECT * FROM table_src_{format} + +DROP TABLE IF EXISTS table_src_{format} +DROP TABLE IF EXISTS table_dst_{format} + + From 19f00e7c42bd9ba0af63be4987a89aad3457b3c8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:33:18 +0200 Subject: [PATCH 025/192] Restart CI --- tests/performance/trivial_insert_select_from_files.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/performance/trivial_insert_select_from_files.xml b/tests/performance/trivial_insert_select_from_files.xml index f6ec6500c10..64bd3e54b95 100644 --- a/tests/performance/trivial_insert_select_from_files.xml +++ b/tests/performance/trivial_insert_select_from_files.xml @@ -19,7 +19,6 @@ CREATE TABLE IF NOT EXISTS table_src_{format} ENGINE = File({format}) AS test.hits CREATE TABLE IF NOT EXISTS table_dst_{format} AS test.hits - INSERT INTO table_src_{format} SELECT * FROM test.hits LIMIT 100000 INSERT INTO table_dst_{format} SELECT * FROM table_src_{format} From 41e0720c0e4e1f3dd1262ffc5932285e700c8dd4 Mon Sep 17 00:00:00 2001 From: zhongyuankai Date: Wed, 10 Apr 2024 09:50:47 +0800 Subject: [PATCH 026/192] cmment --- src/Parsers/ASTDropQuery.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Parsers/ASTDropQuery.h b/src/Parsers/ASTDropQuery.h index 7e25e990bc8..e18043b771b 100644 --- a/src/Parsers/ASTDropQuery.h +++ 
b/src/Parsers/ASTDropQuery.h @@ -37,13 +37,13 @@ public: bool sync{false}; - // We detach the object permanently, so it will not be reattached back during server restart. + /// We detach the object permanently, so it will not be reattached back during server restart. bool permanently{false}; - /// Used to drop multiple tables only, example: Drop TABLE t1, t2, t3... + /// Used to drop multiple tables only, example: DROP TABLE t1, t2, t3... ASTPtr database_and_tables; - /** Get the text that identifies this element. */ + /// Get the text that identifies this element. String getID(char) const override; ASTPtr clone() const override; @@ -52,7 +52,7 @@ public: return removeOnCluster(clone(), params.default_database); } - /** Convert an AST that deletes multiple tables into multiple ASTs that delete a single table. */ + /// Convert an AST that deletes multiple tables into multiple ASTs that delete a single table. ASTs getRewrittenASTsOfSingleTable(); QueryKind getQueryKind() const override { return QueryKind::Drop; } From b6b7c3f80f6461e8198e9d70f6b4742f62d6435d Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 10 Apr 2024 13:41:21 +0200 Subject: [PATCH 027/192] Fxi --- src/Interpreters/Cache/FileCache.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 564dd03dd3c..29836dc1d15 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -191,9 +191,10 @@ void FileCache::initialize() metadata.startup(); if (keep_current_size_to_max_ratio != 1 || keep_current_elements_to_max_ratio != 1) + { keep_up_free_space_ratio_task = Context::getGlobalContextInstance()->getSchedulePool().createTask(log->name(), [this] { freeSpaceRatioKeepingThreadFunc(); }); - - keep_up_free_space_ratio_task->schedule(); + keep_up_free_space_ratio_task->schedule(); + } is_initialized = true; } From dc07496fd668f5db3e28a123fbfb9347b3ce7c63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 10 Apr 2024 15:45:03 +0000 Subject: [PATCH 028/192] Fix validation of special MergeTree columns --- src/Storages/MergeTree/MergeTreeData.cpp | 90 ++++++++++++++++++- .../03093_special_column_errors.reference | 0 .../03093_special_column_errors.sql | 36 ++++++++ 3 files changed, 122 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03093_special_column_errors.reference create mode 100644 tests/queries/0_stateless/03093_special_column_errors.sql diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 3e4350c2fbb..7eb862c8c72 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -861,6 +861,42 @@ void MergeTreeData::checkTTLExpressions(const StorageInMemoryMetadata & new_meta } } +namespace +{ +template +void checkSpecialColumn(const std::string_view column_meta_name, const AlterCommand & command) +{ + if (command.type == AlterCommand::MODIFY_COLUMN) + { + if (!typeid_cast(command.data_type.get())) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Cannot alter {} column ({}) to type {}, because it must have type {}", + column_meta_name, + command.column_name, + command.data_type->getName(), + TypeName); + } + } + else if (command.type == AlterCommand::DROP_COLUMN) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Trying to ALTER DROP {} ({}) column", + column_meta_name, + backQuoteIfNeed(command.column_name)); + } 
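    /// RENAME is rejected below for the same reason as DROP: the merging params refer
    /// to these special columns by name, so a rename would leave a dangling reference.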
+ else if (command.type == AlterCommand::RENAME_COLUMN) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Trying to ALTER RENAME {} ({}) column", + column_meta_name, + backQuoteIfNeed(command.column_name)); + } +}; +} void MergeTreeData::checkStoragePolicy(const StoragePolicyPtr & new_storage_policy) const { @@ -993,10 +1029,20 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat { return column_to_sum == Nested::extractTableName(name_and_type.name); }; - if (columns.end() == std::find_if(columns.begin(), columns.end(), check_column_to_sum_exists)) - throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, - "Column {} listed in columns to sum does not exist in table declaration.", - column_to_sum); + const auto column_it = std::find_if(columns.begin(), columns.end(), check_column_to_sum_exists); + + if (columns.end() == column_it) + { + throw Exception( + ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, + "Column {} listed in columns to sum does not exist in table declaration", + column_to_sum); + } + else if (!isNumber(column_it->type)) + { + throw Exception( + ErrorCodes::BAD_TYPE_OF_FIELD, "Column {} listed in columns to sum does not have number type", column_to_sum); + } } /// Check that summing columns are not in partition key. @@ -3114,6 +3160,42 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context } } + // Technically it is possible to specify the same column for `version` and `is_deleted`, thus let's be sure and don't use -if-else here + if (command.column_name == merging_params.is_deleted_column) + { + checkSpecialColumn("is_deleted", command); + } + else if (command.column_name == merging_params.sign_column) + { + checkSpecialColumn("sign", command); + } + else if (std::ranges::any_of( + merging_params.columns_to_sum, [&](const String & column_to_sum) { return command.column_name == column_to_sum; })) + { + if (command.type == AlterCommand::MODIFY_COLUMN && !isNumber(command.data_type)) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Cannot alter summarizing column ({}) to type {}, because it must have numeric type", + command.column_name, + command.data_type->getName()); + } + else if (command.type == AlterCommand::DROP_COLUMN) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Trying to ALTER DROP summarizing column ({})", + backQuoteIfNeed(command.column_name)); + } + else if (command.type == AlterCommand::RENAME_COLUMN) + { + throw Exception( + ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, + "Trying to ALTER RENAME summarizing column ({})", + backQuoteIfNeed(command.column_name)); + } + } + if (command.type == AlterCommand::MODIFY_QUERY) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ALTER MODIFY QUERY is not supported by MergeTree engines family"); diff --git a/tests/queries/0_stateless/03093_special_column_errors.reference b/tests/queries/0_stateless/03093_special_column_errors.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03093_special_column_errors.sql b/tests/queries/0_stateless/03093_special_column_errors.sql new file mode 100644 index 00000000000..2bff551738e --- /dev/null +++ b/tests/queries/0_stateless/03093_special_column_errors.sql @@ -0,0 +1,36 @@ +CREATE TABLE replacing_wrong (key Int64, ver Int64, is_deleted UInt16) ENGINE = ReplacingMergeTree(ver, is_deleted) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } +CREATE TABLE replacing_wrong (key Int64, ver String, is_deleted UInt8) ENGINE = 
ReplacingMergeTree(ver, is_deleted) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } + +CREATE TABLE replacing (key Int64, ver Int64, is_deleted UInt8) ENGINE = ReplacingMergeTree(ver, is_deleted) ORDER BY key; +ALTER TABLE replacing MODIFY COLUMN ver String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing MODIFY COLUMN ver Int128; +ALTER TABLE replacing MODIFY COLUMN is_deleted String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing MODIFY COLUMN is_deleted UInt16; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing MODIFY COLUMN is_deleted Int8; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing DROP COLUMN ver; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing DROP COLUMN is_deleted; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing RENAME COLUMN ver TO ver2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE replacing RENAME COLUMN is_deleted TO is_deleted2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } + +CREATE TABLE summing_wrong (key Int64, sum1 Int64, sum2 String) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } +CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } +CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree(sum_doesnt_exists) ORDER BY key; -- { serverError NO_SUCH_COLUMN_IN_TABLE } + +CREATE TABLE summing (key Int64, sum1 Int64, sum2 UInt64, not_sum String) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; +ALTER TABLE summing MODIFY COLUMN sum1 Int32, MODIFY COLUMN sum2 IPv4; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE summing MODIFY COLUMN sum2 String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE summing MODIFY COLUMN sum1 Int32, MODIFY COLUMN sum2 UInt256; +ALTER TABLE summing DROP COLUMN sum1; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE summing DROP COLUMN sum2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE summing RENAME COLUMN sum1 TO sum3; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE summing RENAME COLUMN sum2 TO sum3; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE summing RENAME COLUMN not_sum TO still_not_sum; + +CREATE TABLE collapsing_wrong (key Int64, sign Int16) ENGINE = CollapsingMergeTree(sign) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } +CREATE TABLE collapsing_wrong (key Int64, sign UInt8) ENGINE = CollapsingMergeTree(sign) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } +CREATE TABLE collapsing_wrong (key Int64, sign UInt8) ENGINE = CollapsingMergeTree(not_existing) ORDER BY key; -- { serverError NO_SUCH_COLUMN_IN_TABLE } + +CREATE TABLE collapsing (key Int64, sign Int8) ENGINE = CollapsingMergeTree(sign) ORDER BY key; +ALTER TABLE collapsing MODIFY COLUMN sign String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE collapsing DROP COLUMN sign; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } +ALTER TABLE collapsing RENAME COLUMN sign TO sign2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } From 05f55504bb487148d5ced68feb2f9e1ad9ca2597 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 10 Apr 2024 19:32:11 +0200 Subject: [PATCH 029/192] Fix schedule --- src/Interpreters/Cache/FileCache.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/Cache/FileCache.cpp 
b/src/Interpreters/Cache/FileCache.cpp index 29836dc1d15..0a2f0da0305 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -988,12 +988,13 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() FileCacheReserveStat stat; EvictionCandidates eviction_candidates; + bool limits_satisfied = true; try { /// Collect at most `keep_up_free_space_remove_batch` elements to evict, /// (we use batches to make sure we do not block cache for too long, /// by default the batch size is quite small). - const bool limits_satisfied = main_priority->collectCandidatesForEviction( + limits_satisfied = main_priority->collectCandidatesForEviction( desired_size, desired_elements_num, keep_up_free_space_remove_batch, stat, eviction_candidates, lock); #ifdef ABORT_ON_LOGICAL_ERROR @@ -1012,8 +1013,6 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() || current_elements_count <= desired_elements_num || current_elements_count - stat.total_stat.releasable_count <= desired_elements_num); } -#else - UNUSED(limits_satisfied); #endif if (shutdown) @@ -1040,10 +1039,6 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() lock.lock(); eviction_candidates.finalize(nullptr, lock); } - else - { - keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms); - } } catch (...) { @@ -1052,14 +1047,17 @@ void FileCache::freeSpaceRatioKeepingThreadFunc() if (eviction_candidates.size() > 0) eviction_candidates.finalize(nullptr, lockCache()); - keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms); - /// Let's catch such cases in ci, /// in general there should not be exceptions. chassert(false); } LOG_TRACE(log, "Free space ratio keeping thread finished in {} ms", watch.elapsedMilliseconds()); + + if (limits_satisfied) + keep_up_free_space_ratio_task->scheduleAfter(general_reschedule_ms); + else + keep_up_free_space_ratio_task->schedule(); } void FileCache::iterate(IterateFunc && func, const UserID & user_id) From ce457e76256e397bea451db5c1012b8c49e8d70f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Thu, 11 Apr 2024 09:01:14 +0000 Subject: [PATCH 030/192] Add checks to prevent using the same column for multiple special columns --- src/Storages/MergeTree/MergeTreeData.cpp | 15 ++++++++++++--- .../0_stateless/03093_special_column_errors.sql | 6 ++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 7eb862c8c72..8f97ec9cbc2 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1022,6 +1022,11 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (mode == MergingParams::Summing) { + auto columns_to_sum_copy = columns_to_sum; + std::sort(columns_to_sum_copy.begin(), columns_to_sum_copy.end()); + if (const auto it = std::adjacent_find(columns_to_sum_copy.begin(), columns_to_sum_copy.end()); it != columns_to_sum_copy.end()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Column {} is listed multiple times in the list of columns to sum", *it); + /// If columns_to_sum are set, then check that such columns exist. 
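The new duplicate check above is the standard sort-then-scan idiom: sorting a copy of the list puts equal names next to each other, so a single std::adjacent_find pass detects any repetition. A minimal self-contained sketch of the same idiom, with hypothetical names (an editorial illustration, not part of the patch):

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Returns the first name that occurs more than once, or "" if all are unique.
// Sorting a copy places duplicates next to each other, so one
// std::adjacent_find pass over the sorted range is enough.
std::string findDuplicate(std::vector<std::string> names) // taken by value: we sort a copy
{
    std::sort(names.begin(), names.end());
    const auto it = std::adjacent_find(names.begin(), names.end());
    return it == names.end() ? "" : *it;
}

int main()
{
    std::cout << findDuplicate({"sum2", "sum1", "sum2"}) << '\n'; // prints "sum2"
}
```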
for (const auto & column_to_sum : columns_to_sum) { @@ -1063,12 +1068,18 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (mode == MergingParams::Replacing) { + if (version_column == is_deleted_column) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The version and is_deleted column cannot be the same column ({})", version_column); + check_is_deleted_column(true, "ReplacingMergeTree"); check_version_column(true, "ReplacingMergeTree"); } if (mode == MergingParams::VersionedCollapsing) { + if (version_column == sign_column) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The version and sign column cannot be the same column ({})", version_column); + check_sign_column(false, "VersionedCollapsingMergeTree"); check_version_column(false, "VersionedCollapsingMergeTree"); } @@ -3159,9 +3170,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context "Trying to ALTER RENAME version {} column", backQuoteIfNeed(command.column_name)); } } - - // Technically it is possible to specify the same column for `version` and `is_deleted`, thus let's be sure and don't use -if-else here - if (command.column_name == merging_params.is_deleted_column) + else if (command.column_name == merging_params.is_deleted_column) { checkSpecialColumn("is_deleted", command); } diff --git a/tests/queries/0_stateless/03093_special_column_errors.sql b/tests/queries/0_stateless/03093_special_column_errors.sql index 2bff551738e..bbdf1bb7183 100644 --- a/tests/queries/0_stateless/03093_special_column_errors.sql +++ b/tests/queries/0_stateless/03093_special_column_errors.sql @@ -1,5 +1,6 @@ CREATE TABLE replacing_wrong (key Int64, ver Int64, is_deleted UInt16) ENGINE = ReplacingMergeTree(ver, is_deleted) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } CREATE TABLE replacing_wrong (key Int64, ver String, is_deleted UInt8) ENGINE = ReplacingMergeTree(ver, is_deleted) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } +CREATE TABLE replacing_wrong (key Int64, ver Int64, is_deleted UInt8) ENGINE = ReplacingMergeTree(is_deleted, is_deleted) ORDER BY key; -- { serverError BAD_ARGUMENTS } CREATE TABLE replacing (key Int64, ver Int64, is_deleted UInt8) ENGINE = ReplacingMergeTree(ver, is_deleted) ORDER BY key; ALTER TABLE replacing MODIFY COLUMN ver String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } @@ -15,6 +16,7 @@ ALTER TABLE replacing RENAME COLUMN is_deleted TO is_deleted2; -- { serverError CREATE TABLE summing_wrong (key Int64, sum1 Int64, sum2 String) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree(sum_doesnt_exists) ORDER BY key; -- { serverError NO_SUCH_COLUMN_IN_TABLE } +CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree((sum1, sum2, sum1)) ORDER BY key; -- { serverError BAD_ARGUMENTS } CREATE TABLE summing (key Int64, sum1 Int64, sum2 UInt64, not_sum String) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; ALTER TABLE summing MODIFY COLUMN sum1 Int32, MODIFY COLUMN sum2 IPv4; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } @@ -34,3 +36,7 @@ CREATE TABLE collapsing (key Int64, sign Int8) ENGINE = CollapsingMergeTree(sign ALTER TABLE collapsing MODIFY COLUMN sign String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } ALTER TABLE collapsing 
DROP COLUMN sign; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } ALTER TABLE collapsing RENAME COLUMN sign TO sign2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } + +CREATE TABLE versioned_collapsing_wrong (key Int64, version UInt8, sign Int8) ENGINE = VersionedCollapsingMergeTree(sign, sign) ORDER BY key; -- { serverError BAD_ARGUMENTS } + +CREATE TABLE versioned_collapsing (key Int64, version UInt8, sign Int8) ENGINE = VersionedCollapsingMergeTree(sign, version) ORDER BY key; From 583bdff2bc51c3dbff275d76fbdbd7e9e209915c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Thu, 11 Apr 2024 10:57:33 +0000 Subject: [PATCH 031/192] Abandon validating the special columns of SummingMergeTree, as they are way more complex --- src/Storages/MergeTree/MergeTreeData.cpp | 39 +------------------ .../03093_special_column_errors.sql | 15 ------- 2 files changed, 2 insertions(+), 52 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8f97ec9cbc2..d09bb23de90 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1034,20 +1034,11 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat { return column_to_sum == Nested::extractTableName(name_and_type.name); }; - const auto column_it = std::find_if(columns.begin(), columns.end(), check_column_to_sum_exists); - - if (columns.end() == column_it) - { + if (columns.end() == std::find_if(columns.begin(), columns.end(), check_column_to_sum_exists)) throw Exception( ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, - "Column {} listed in columns to sum does not exist in table declaration", + "Column {} listed in columns to sum does not exist in table declaration.", column_to_sum); - } - else if (!isNumber(column_it->type)) - { - throw Exception( - ErrorCodes::BAD_TYPE_OF_FIELD, "Column {} listed in columns to sum does not have number type", column_to_sum); - } } /// Check that summing columns are not in partition key. 
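One reason these checks are subtle: `check_column_to_sum_exists` above compares each entry against `Nested::extractTableName(name_and_type.name)`, so an entry in the columns-to-sum list can name a whole Nested group rather than a single top-level column. A standalone sketch of that matching rule, assuming the helper keeps everything before the first dot (a hypothetical stand-in, not the real `Nested::extractTableName`):

```cpp
#include <cassert>
#include <string>

// Assumed behavior: the prefix before the first dot, i.e. the name of the
// Nested column that a subcolumn like "n.values" belongs to.
std::string extractTableName(const std::string & name)
{
    const auto pos = name.find('.');
    return pos == std::string::npos ? name : name.substr(0, pos);
}

int main()
{
    // A columns-to-sum entry "n" matches every subcolumn of the Nested column "n".
    assert(extractTableName("n.values") == "n");
    // Plain columns are matched by their own name.
    assert(extractTableName("sum1") == "sum1");
}
```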
@@ -3178,32 +3169,6 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context { checkSpecialColumn("sign", command); } - else if (std::ranges::any_of( - merging_params.columns_to_sum, [&](const String & column_to_sum) { return command.column_name == column_to_sum; })) - { - if (command.type == AlterCommand::MODIFY_COLUMN && !isNumber(command.data_type)) - { - throw Exception( - ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, - "Cannot alter summarizing column ({}) to type {}, because it must have numeric type", - command.column_name, - command.data_type->getName()); - } - else if (command.type == AlterCommand::DROP_COLUMN) - { - throw Exception( - ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, - "Trying to ALTER DROP summarizing column ({})", - backQuoteIfNeed(command.column_name)); - } - else if (command.type == AlterCommand::RENAME_COLUMN) - { - throw Exception( - ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, - "Trying to ALTER RENAME summarizing column ({})", - backQuoteIfNeed(command.column_name)); - } - } if (command.type == AlterCommand::MODIFY_QUERY) throw Exception(ErrorCodes::NOT_IMPLEMENTED, diff --git a/tests/queries/0_stateless/03093_special_column_errors.sql b/tests/queries/0_stateless/03093_special_column_errors.sql index bbdf1bb7183..1464927db7e 100644 --- a/tests/queries/0_stateless/03093_special_column_errors.sql +++ b/tests/queries/0_stateless/03093_special_column_errors.sql @@ -13,21 +13,6 @@ ALTER TABLE replacing DROP COLUMN is_deleted; -- { serverError ALTER_OF_COLUMN_I ALTER TABLE replacing RENAME COLUMN ver TO ver2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } ALTER TABLE replacing RENAME COLUMN is_deleted TO is_deleted2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -CREATE TABLE summing_wrong (key Int64, sum1 Int64, sum2 String) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } -CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } -CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree(sum_doesnt_exists) ORDER BY key; -- { serverError NO_SUCH_COLUMN_IN_TABLE } -CREATE TABLE summing_wrong (key Int64, sum1 String, sum2 Int64) ENGINE = SummingMergeTree((sum1, sum2, sum1)) ORDER BY key; -- { serverError BAD_ARGUMENTS } - -CREATE TABLE summing (key Int64, sum1 Int64, sum2 UInt64, not_sum String) ENGINE = SummingMergeTree((sum1, sum2)) ORDER BY key; -ALTER TABLE summing MODIFY COLUMN sum1 Int32, MODIFY COLUMN sum2 IPv4; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -ALTER TABLE summing MODIFY COLUMN sum2 String; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -ALTER TABLE summing MODIFY COLUMN sum1 Int32, MODIFY COLUMN sum2 UInt256; -ALTER TABLE summing DROP COLUMN sum1; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -ALTER TABLE summing DROP COLUMN sum2; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -ALTER TABLE summing RENAME COLUMN sum1 TO sum3; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -ALTER TABLE summing RENAME COLUMN sum2 TO sum3; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } -ALTER TABLE summing RENAME COLUMN not_sum TO still_not_sum; - CREATE TABLE collapsing_wrong (key Int64, sign Int16) ENGINE = CollapsingMergeTree(sign) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } CREATE TABLE collapsing_wrong (key Int64, sign UInt8) ENGINE = CollapsingMergeTree(sign) ORDER BY key; -- { serverError BAD_TYPE_OF_FIELD } CREATE TABLE collapsing_wrong (key Int64, sign 
UInt8) ENGINE = CollapsingMergeTree(not_existing) ORDER BY key; -- { serverError NO_SUCH_COLUMN_IN_TABLE } From 78c34916f93c11a913bc16149e028c9b688c902d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Thu, 11 Apr 2024 13:44:23 +0000 Subject: [PATCH 032/192] Fix incorrect error when no special columns are specified --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index d09bb23de90..7a138331207 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1059,7 +1059,7 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (mode == MergingParams::Replacing) { - if (version_column == is_deleted_column) + if (!version_column.empty() && version_column == is_deleted_column) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The version and is_deleted column cannot be the same column ({})", version_column); check_is_deleted_column(true, "ReplacingMergeTree"); @@ -1068,7 +1068,7 @@ void MergeTreeData::MergingParams::check(const StorageInMemoryMetadata & metadat if (mode == MergingParams::VersionedCollapsing) { - if (version_column == sign_column) + if (!version_column.empty() && version_column == sign_column) throw Exception(ErrorCodes::BAD_ARGUMENTS, "The version and sign column cannot be the same column ({})", version_column); check_sign_column(false, "VersionedCollapsingMergeTree"); From d3a58ffbbd5e5efc5c0c9679d4b56737fe6de40f Mon Sep 17 00:00:00 2001 From: pet74alex <167422282+pet74alex@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:22:40 +0300 Subject: [PATCH 033/192] Added generateUUIDv7* functions --- src/Functions/generateUUIDv7.cpp | 373 +++++++++++++++++++++++++++++++ 1 file changed, 373 insertions(+) create mode 100644 src/Functions/generateUUIDv7.cpp diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp new file mode 100644 index 00000000000..4bd540d28db --- /dev/null +++ b/src/Functions/generateUUIDv7.cpp @@ -0,0 +1,373 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ +constexpr auto bits_in_counter = 42; +constexpr uint64_t counter_limit = (uint64_t{1} << bits_in_counter); +constexpr uint8_t random_data_offset = 6; +constexpr uint8_t random_data_count = 10; +constexpr uint8_t next_count_random_data_offset = 12; +constexpr uint8_t next_count_random_data_count = 4; + +using UUIDAsArray = std::array; + +inline uint64_t getTimestampMs() +{ + timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + const uint64_t sec = tp.tv_sec; + return sec * 1000 + tp.tv_nsec / 1000000; +} + +inline void fillTimestamp(UUIDAsArray & uuid, uint64_t timestamp) +{ + uuid[0] = (timestamp >> 40) & 0xFF; + uuid[1] = (timestamp >> 32) & 0xFF; + uuid[2] = (timestamp >> 24) & 0xFF; + uuid[3] = (timestamp >> 16) & 0xFF; + uuid[4] = (timestamp >> 8) & 0xFF; + uuid[5] = (timestamp)&0xFF; +} +} + +#define DECLARE_SEVERAL_IMPLEMENTATIONS(...) 
\ + DECLARE_DEFAULT_CODE(__VA_ARGS__) \ + DECLARE_AVX2_SPECIFIC_CODE(__VA_ARGS__) + +DECLARE_SEVERAL_IMPLEMENTATIONS( + + namespace UUIDv7Impl { + inline void store(UUID & new_uuid, UUIDAsArray & uuid) + { + uuid[6] = (uuid[6] & 0x0f) | 0x70; // version 7 + uuid[8] = (uuid[8] & 0x3f) | 0x80; // variant 2 + + DB::UUIDHelpers::getHighBytes(new_uuid) = unalignedLoadBigEndian(uuid.data()); + DB::UUIDHelpers::getLowBytes(new_uuid) = unalignedLoadBigEndian(uuid.data() + 8); + } + + struct UUIDv7Base + { + UUIDAsArray & uuid; + UUIDv7Base(UUIDAsArray & u) : uuid(u) { } + }; + + struct RandomData + { + static constexpr auto name = "generateUUIDv7"; + struct Data : UUIDv7Base + { + UUIDAsArray uuid_data; + + Data() : UUIDv7Base(uuid_data) { } + + void generate(UUID & new_uuid) + { + fillTimestamp(uuid, getTimestampMs()); + memcpy(uuid.data() + random_data_offset, &new_uuid, random_data_count); + store(new_uuid, uuid); + } + }; + }; + + struct CounterDataCommon : UUIDv7Base + { + CounterDataCommon(UUIDAsArray & u) : UUIDv7Base(u) { } + + uint64_t getCounter() + { + uint64_t counter = uuid[6] & 0x0f; + counter = (counter << 8) | uuid[7]; + counter = (counter << 6) | (uuid[8] & 0x3f); + counter = (counter << 8) | uuid[9]; + counter = (counter << 8) | uuid[10]; + counter = (counter << 8) | uuid[11]; + return counter; + } + + void generate(UUID & newUUID) + { + uint64_t timestamp = 0; + /// Get timestamp of the previous uuid + for (int i = 0; i != 6; ++i) + { + timestamp = (timestamp << 8) | uuid[i]; + } + + const uint64_t unix_time_ms = getTimestampMs(); + // continue incrementing counter when clock slightly goes back or when counter overflow happened during the previous UUID generation + bool need_to_increment_counter = (timestamp == unix_time_ms || timestamp < unix_time_ms + 10000); + uint64_t counter = 0; + if (need_to_increment_counter) + { + counter = getCounter(); + } + else + { + timestamp = unix_time_ms; + } + + bool counter_incremented = false; + if (need_to_increment_counter) + { + if (++counter == counter_limit) + { + ++timestamp; + // counter bytes will be filled by the random data + } + else + { + uuid[6] = counter >> 38; + uuid[7] = counter >> 30; + uuid[8] = counter >> 24; + uuid[9] = counter >> 16; + uuid[10] = counter >> 8; + uuid[11] = counter; + counter_incremented = true; + } + } + + fillTimestamp(uuid, timestamp); + + // Get the required number of random bytes: 4 in the case of incrementing existing counter, 10 in the case of renewing counter + memcpy( + uuid.data() + (counter_incremented ? next_count_random_data_offset : random_data_offset), + &newUUID, + counter_incremented ? 
next_count_random_data_count : random_data_count); + + store(newUUID, uuid); + } + }; + + struct ThreadLocalCounter + { + static constexpr auto name = "generateUUIDv7WithFastCounter"; + struct Data : CounterDataCommon + { + // Implement counter monotony only within one thread so function doesn't require mutexes and doesn't affect performance of the same function running simultenaously on other threads + static inline thread_local UUIDAsArray uuid_data; + Data() : CounterDataCommon(uuid_data) { } + }; + }; + + struct GlobalCounter + { + static constexpr auto name = "generateUUIDv7WithCounter"; + struct Data : std::lock_guard, CounterDataCommon + { + // Implement counter monotony whithin one timestamp accross all threads generating UUIDv7 with counter simultaneously + static inline UUIDAsArray uuid_data; + static inline std::mutex mtx; + Data() : std::lock_guard(mtx), CounterDataCommon(uuid_data) { } + }; + }; + } // namespace UUIDv7Impl + + + template + class FunctionGenerateUUIDv7Base + : public IFunction, + public FillPolicy { + public: + using FillPolicy::name; + using FillPolicyData = typename FillPolicy::Data; + + FunctionGenerateUUIDv7Base() = default; + + String getName() const final + { + return name; + } + + size_t getNumberOfArguments() const final + { + return 0; + } + + bool isDeterministicInScopeOfQuery() const final + { + return false; + } + bool useDefaultImplementationForNulls() const final + { + return false; + } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const final + { + return false; + } + bool isVariadic() const final + { + return true; + } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 1) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 0 or 1.", + getName(), + arguments.size()); + + return std::make_shared(); + } + + bool isDeterministic() const override + { + return false; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override + { + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_to = col_res->getData(); + + size_t size = input_rows_count; + vec_to.resize(size); + + /// RandImpl is target-dependent and is not the same in different TargetSpecific namespaces. 
+ /// Not all random bytes produced here are required for the UUIDv7 but it's the simplest way to get the required number of them by using RandImpl + RandImpl::execute(reinterpret_cast(vec_to.data()), vec_to.size() * sizeof(UUID)); + + for (UUID & new_uuid : vec_to) + { + FillPolicyData data; + data.generate(new_uuid); + } + + return col_res; + } + }; + + using FunctionGenerateUUIDv7 = FunctionGenerateUUIDv7Base; + using FunctionGenerateUUIDv7WithCounter = FunctionGenerateUUIDv7Base; + using FunctionGenerateUUIDv7WithFastCounter = FunctionGenerateUUIDv7Base; + + ) // DECLARE_SEVERAL_IMPLEMENTATIONS +#undef DECLARE_SEVERAL_IMPLEMENTATIONS + + +class FunctionGenerateUUIDv7 : public TargetSpecific::Default::FunctionGenerateUUIDv7 +{ +public: + explicit FunctionGenerateUUIDv7(ContextPtr context) : selector(context) + { + selector.registerImplementation(); + +#if USE_MULTITARGET_CODE + selector.registerImplementation(); +#endif + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + return selector.selectAndExecute(arguments, result_type, input_rows_count); + } + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + +private: + ImplementationSelector selector; +}; + +class FunctionGenerateUUIDv7WithCounter : public TargetSpecific::Default::FunctionGenerateUUIDv7WithCounter +{ +public: + explicit FunctionGenerateUUIDv7WithCounter(ContextPtr context) : selector(context) + { + selector.registerImplementation(); + +#if USE_MULTITARGET_CODE + selector.registerImplementation(); +#endif + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + return selector.selectAndExecute(arguments, result_type, input_rows_count); + } + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + +private: + ImplementationSelector selector; +}; + + +class FunctionGenerateUUIDv7WithFastCounter : public TargetSpecific::Default::FunctionGenerateUUIDv7WithFastCounter +{ +public: + explicit FunctionGenerateUUIDv7WithFastCounter(ContextPtr context) : selector(context) + { + selector.registerImplementation(); + +#if USE_MULTITARGET_CODE + selector.registerImplementation(); +#endif + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + return selector.selectAndExecute(arguments, result_type, input_rows_count); + } + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + +private: + ImplementationSelector selector; +}; + + +REGISTER_FUNCTION(GenerateUUIDv7) +{ + factory.registerFunction( + FunctionDocumentation{ + .description = R"( +Generates a UUID of version 7 with current Unix time having milliseconds precision followed by random data. +This function takes an optional argument, the value of which is discarded to generate different values in case the function is called multiple times. +The function returns a value of type UUID. 
+)", + .examples{{"uuid", "SELECT generateUUIDv7()", ""}, {"multiple", "SELECT generateUUIDv7(1), generateUUIDv7(2)", ""}}, + .categories{"UUID"}}, + FunctionFactory::CaseSensitive); + + factory.registerFunction( + FunctionDocumentation{ + .description = R"( +Generates a UUID of version 7 with current Unix time having milliseconds precision, a monotonic counter within the same timestamp starting from the random value, and followed by 4 random bytes. +This function takes an optional argument, the value of which is discarded to generate different values in case the function is called multiple times. +The function returns a value of type UUID. +)", + .examples{ + {"uuid", "SELECT generateUUIDv7WithCounter()", ""}, + {"multiple", "SELECT generateUUIDv7WithCounter(1), generateUUIDv7WithCounter(2)", ""}}, + .categories{"UUID"}}, + FunctionFactory::CaseSensitive); + + factory.registerFunction( + FunctionDocumentation{ + .description = R"( +Generates a UUID of version 7 with current Unix time having milliseconds precision, a monotonic counter within the same timestamp and the same request starting from the random value, and followed by 4 random bytes. +This function takes an optional argument, the value of which is discarded to generate different values in case the function is called multiple times. +This function is a little bit faster version of the function GenerateUUIDv7WithCounter. It doesn't guarantee the counter monotony withing the same timestamp accross different requests. It means that two UUIDs having +The function returns a value of type UUID. +)", + .examples{ + {"uuid", "SELECT generateUUIDv7WithFastCounter()", ""}, + {"multiple", "SELECT generateUUIDv7WithFastCounter(1), generateUUIDv7WithFastCounter(2)", ""}}, + .categories{"UUID"}}, + FunctionFactory::CaseSensitive); +} + +} From 142ce60b4a9c01ca865cc2cf531238d162d0b994 Mon Sep 17 00:00:00 2001 From: pet74alex <167422282+pet74alex@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:26:15 +0300 Subject: [PATCH 034/192] Added UUIDToNum and UUDv7ToDateTime functions --- src/Functions/FunctionsCodingUUID.cpp | 185 ++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/src/Functions/FunctionsCodingUUID.cpp b/src/Functions/FunctionsCodingUUID.cpp index dd9170e44ad..dceff894c34 100644 --- a/src/Functions/FunctionsCodingUUID.cpp +++ b/src/Functions/FunctionsCodingUUID.cpp @@ -1,14 +1,18 @@ #include +#include #include #include +#include #include #include #include #include #include +#include #include #include #include +#include #include #include #include @@ -319,10 +323,191 @@ public: } }; + +class FunctionUUIDToNum : public IFunction +{ +public: + static constexpr auto name = "UUIDToNum"; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 0; } + bool isInjective(const ColumnsWithTypeAndName &) const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool isVariadic() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + checkArgumentCount(arguments, name); + + if (!isUUID(arguments[0])) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}, expected UUID", + arguments[0]->getName(), + getName()); + } + + checkFormatArgument(arguments, name); + + return 
std::make_shared(uuid_bytes_length); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + const ColumnWithTypeAndName & col_type_name = arguments[0]; + const ColumnPtr & column = col_type_name.column; + + const bool defaultFormat = (parseVariant(arguments) == UUIDSerializer::Variant::Default); + if (const auto * col_in = checkAndGetColumn(column.get())) + { + const auto & vec_in = col_in->getData(); + const UUID * uuids = vec_in.data(); + const size_t size = vec_in.size(); + + auto col_res = ColumnFixedString::create(uuid_bytes_length); + + ColumnString::Chars & vec_res = col_res->getChars(); + vec_res.resize(size * uuid_bytes_length); + + size_t dst_offset = 0; + + for (size_t i = 0; i < size; ++i) + { + uint64_t hiBytes = DB::UUIDHelpers::getHighBytes(uuids[i]); + uint64_t loBytes = DB::UUIDHelpers::getLowBytes(uuids[i]); + unalignedStoreBigEndian(&vec_res[dst_offset], hiBytes); + unalignedStoreBigEndian(&vec_res[dst_offset + sizeof(hiBytes)], loBytes); + if (!defaultFormat) + { + std::swap(vec_res[dst_offset], vec_res[dst_offset + 3]); + std::swap(vec_res[dst_offset + 1], vec_res[dst_offset + 2]); + } + dst_offset += uuid_bytes_length; + } + + return col_res; + } + else + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()); + } +}; + +class FunctionUUIDv7ToDateTime : public IFunction +{ +public: + static constexpr auto name = "UUIDv7ToDateTime"; + static constexpr UInt32 DATETIME_SCALE = 3; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 0; } + bool isInjective(const ColumnsWithTypeAndName &) const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + bool isVariadic() const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (arguments.empty() || arguments.size() > 2) + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Wrong number of arguments for function {}: should be 1 or 2", getName()); + + if (!checkAndGetDataType(arguments[0].type.get())) + { + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}, expected UUID", + arguments[0].type->getName(), + getName()); + } + + String timezone; + if (arguments.size() == 2) + { + timezone = extractTimeZoneNameFromColumn(arguments[1].column.get(), arguments[1].name); + + if (timezone.empty()) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Function {} supports a 2nd argument (optional) that must be a valid time zone", + getName()); + } + + return std::make_shared(DATETIME_SCALE, timezone); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + const ColumnWithTypeAndName & col_type_name = arguments[0]; + const ColumnPtr & column = col_type_name.column; + + if (const auto * col_in = checkAndGetColumn(column.get())) + { + const auto & vec_in = col_in->getData(); + const UUID * uuids = vec_in.data(); + const size_t size = vec_in.size(); + + 
auto col_res = ColumnDateTime64::create(size, DATETIME_SCALE); + auto & vec_res = col_res->getData(); + + for (size_t i = 0; i < size; ++i) + { + uint64_t hiBytes = DB::UUIDHelpers::getHighBytes(uuids[i]); + if ((hiBytes & 0xf000) == 0x7000) + { + uint64_t ms = hiBytes >> 16; + vec_res[i] = DecimalUtils::decimalFromComponents( + ms / intExp10(DATETIME_SCALE), ms % intExp10(DATETIME_SCALE), DATETIME_SCALE); + } + } + + return col_res; + } + else + throw Exception( + ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()); + } +}; + REGISTER_FUNCTION(CodingUUID) { factory.registerFunction(); factory.registerFunction(); + factory.registerFunction( + FunctionDocumentation{ + .description = R"( +This function accepts a UUID and returns a FixedString(16) as its binary representation, with its format optionally specified by variant (Big-endian by default). +)", + .examples{ + {"uuid", + "select toUUID(UUIDNumToString(toFixedString('a/<@];!~p{jTj={)', 16))) as uuid, UUIDToNum(uuid) as uuidNum, " + "UUIDToNum(uuid, 2) as uuidMsNum", + R"( +┌─uuid─────────────────────────────────┬─uuidNum──────────┬─uuidMsNum────────┐ +│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ @( + FunctionDocumentation{ + .description = R"( +This function extracts the timestamp from a UUID and returns it as a DateTime64(3) typed value. +The function expects the UUID having version 7 to be provided as the first argument. +An optional second argument can be passed to specify a timezone for the timestamp. +)", + .examples{ + {"uuid","select UUIDv7ToDateTime(generateUUIDv7())", ""}, + {"uuid","select generateUUIDv7() as uuid, UUIDv7ToDateTime(uuid), UUIDv7ToDateTime(uuid, 'America/New_York')", ""}}, + .categories{"UUID"}}, + FunctionFactory::CaseSensitive); } } From 4e4e72ead8436ec7e352be043f185412730dfdc8 Mon Sep 17 00:00:00 2001 From: pet74alex <167422282+pet74alex@users.noreply.github.com> Date: Mon, 22 Apr 2024 19:30:25 +0300 Subject: [PATCH 035/192] Update English version of uuid-functions.md --- .../sql-reference/functions/uuid-functions.md | 251 ++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index c338add3a57..5f1246fc3ad 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -51,6 +51,169 @@ SELECT generateUUIDv4(1), generateUUIDv4(2) └──────────────────────────────────────┴──────────────────────────────────────┘ ``` +## generateUUIDv7 + +Generates the [UUID](../data-types/uuid.md) of [version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04). The generated UUID contains current timestamp in milliseconds followed by version 7 and variant 2 markers and random data in the following bit layout. 
+```
+ 0                   1                   2                   3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                           unix_ts_ms                          |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|          unix_ts_ms           |  ver  |        rand_a         |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|var|                          rand_b                           |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                            rand_b                             |
+└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
+```
+
+**Syntax**
+
+``` sql
+generateUUIDv7([x])
+```
+
+**Arguments**
+
+- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself is used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter.
+
+**Returned value**
+
+The UUID type value.
+
+**Usage example**
+
+This example demonstrates creating a table with the UUID type column and inserting a UUIDv7 value into the table.
+
+``` sql
+CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
+
+INSERT INTO t_uuid SELECT generateUUIDv7()
+
+SELECT * FROM t_uuid
+```
+
+```response
+┌────────────────────────────────────x─┐
+│ 018f05af-f4a8-778f-beee-1bedbc95c93b │
+└──────────────────────────────────────┘
+```
+
+**Usage example for generating multiple values in one row**
+
+```sql
+SELECT generateUUIDv7(1), generateUUIDv7(2)
+┌─generateUUIDv7(1)────────────────────┬─generateUUIDv7(2)────────────────────┐
+│ 018f05b1-8c2e-7567-a988-48d09606ae8c │ 018f05b1-8c2e-7946-895b-fcd7635da9a0 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
+## generateUUIDv7WithCounter
+
+Generates the [UUID](../data-types/uuid.md) of [version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04).
+The generated UUID contains the current timestamp in milliseconds, followed by version 7 and variant 2 markers, a counter, and random data, in the following bit layout. At any given new timestamp in unix_ts_ms, the counter starts from some random value and is incremented by 1 on each newly generated UUID until the current timestamp changes. On counter overflow, the unix_ts_ms field is incremented by 1 and the counter restarts from a random value. Counter increment monotony at one timestamp is guaranteed across all `generateUUIDv7WithCounter` functions running simultaneously.
+```
+ 0                   1                   2                   3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                           unix_ts_ms                          |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|          unix_ts_ms           |  ver  |   counter_high_bits   |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|var|                     counter_low_bits                      |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                            rand_b                             |
+└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
+```
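To make the counter semantics just described concrete, here is a minimal single-threaded C++ model of that behavior. It is an illustrative sketch only, with hypothetical names; the actual implementation lives in `src/Functions/generateUUIDv7.cpp`.

```cpp
#include <cstdint>
#include <cstdio>
#include <random>
#include <utility>

// Model of the documented counter: within one unix_ts_ms the 42-bit counter
// grows by 1 per generated UUID; on overflow the timestamp field is bumped
// by 1 and the counter restarts from a fresh random value.
struct CounterModel
{
    static constexpr uint64_t counter_limit = uint64_t{1} << 42;

    uint64_t last_ts_ms = 0;
    uint64_t counter = 0;
    std::mt19937_64 rng{std::random_device{}()};

    // Returns the (unix_ts_ms, counter) pair the next UUID would encode.
    std::pair<uint64_t, uint64_t> next(uint64_t now_ms)
    {
        if (now_ms != last_ts_ms)
        {
            last_ts_ms = now_ms;             // new timestamp: random counter start
            counter = rng() % counter_limit;
        }
        else if (++counter == counter_limit)
        {
            ++last_ts_ms;                    // overflow: borrow one millisecond
            counter = rng() % counter_limit;
        }
        return {last_ts_ms, counter};
    }
};

int main()
{
    CounterModel model;
    auto [ts1, c1] = model.next(1700000000000);
    auto [ts2, c2] = model.next(1700000000000);
    // Same millisecond, so (except for the ~2^-42 overflow case) the counter
    // advanced by exactly 1 and the timestamp did not move: prints "1 0".
    std::printf("%llu %llu\n",
                static_cast<unsigned long long>(c2 - c1),
                static_cast<unsigned long long>(ts2 - ts1));
}
```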
+
+**Syntax**
+
+``` sql
+generateUUIDv7WithCounter([x])
+```
+
+**Arguments**
+
+- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself is used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter.
+
+**Returned value**
+
+The UUID type value.
+
+**Usage example**
+
+This example demonstrates creating a table with the UUID type column and inserting a UUIDv7 value into the table.
+
+``` sql
+CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
+
+INSERT INTO t_uuid SELECT generateUUIDv7WithCounter()
+
+SELECT * FROM t_uuid
+```
+
+```response
+┌────────────────────────────────────x─┐
+│ 018f05c7-56e3-7ac3-93e9-1d93c4218e0e │
+└──────────────────────────────────────┘
+```
+
+**Usage example for generating multiple values in one row**
+
+```sql
+SELECT generateUUIDv7WithCounter(1), generateUUIDv7WithCounter(2)
+┌─generateUUIDv7WithCounter(1)─────────┬─generateUUIDv7WithCounter(2)─────────┐
+│ 018f05c9-4ab8-7b86-b64e-c9f03fbd45d1 │ 018f05c9-4ab8-7b86-b64e-c9f12efb7e16 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
+## generateUUIDv7WithFastCounter
+
+Generates the [UUID](../data-types/uuid.md) of [version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04).
+This function is a faster version of the `generateUUIDv7WithCounter` function, giving no guarantee on counter monotony across different requests running simultaneously. Counter increment monotony at one timestamp is guaranteed only within one thread calling this function to generate many UUIDs.
+
+**Syntax**
+
+``` sql
+generateUUIDv7WithFastCounter([x])
+```
+
+**Arguments**
+
+- `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself is used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter.
+
+**Returned value**
+
+The UUID type value.
+
+**Usage example**
+
+This example demonstrates creating a table with the UUID type column and inserting a UUIDv7 value into the table.
+
+``` sql
+CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
+
+INSERT INTO t_uuid SELECT generateUUIDv7WithFastCounter()
+
+SELECT * FROM t_uuid
+```
+
+```response
+┌────────────────────────────────────x─┐
+│ 018f05e2-e3b2-70cb-b8be-64b09b626d32 │
+└──────────────────────────────────────┘
+```
+
+**Usage example for generating multiple values in one row**
+
+```sql
+SELECT generateUUIDv7WithFastCounter(1), generateUUIDv7WithFastCounter(2)
+┌─generateUUIDv7WithFastCounter(1)─────┬─generateUUIDv7WithFastCounter(2)─────┐
+│ 018f05e1-14ee-7bc5-9906-207153b400b1 │ 018f05e1-14ee-7bc5-9906-2072b8e96758 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
 ## empty
 
 Checks whether the input UUID is empty.
@@ -341,6 +504,94 @@ SELECT
 └──────────────────┴──────────────────────────────────────┘
 ```
 
+## UUIDToNum
+
+Accepts `UUID` and returns a [FixedString(16)](../../sql-reference/data-types/fixedstring.md) as its binary representation, with its format optionally specified by `variant` (`Big-endian` by default).
This function replaces calls to two separate functions `UUIDStringToNum(toString(uuid))` so intermediate conversion from UUID to string is not required to extract bytes from a UUID. + +**Syntax** + +``` sql +UUIDToNum(uuid[, variant = 1]) +``` + +**Arguments** + +- `uuid` — [UUID](../data-types/uuid.md). +- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`. + +**Returned value** + +FixedString(16) + +**Usage examples** + +``` sql +SELECT + toUUID('612f3c40-5d3b-217e-707b-6a546a3d7b29') AS uuid, + UUIDToNum(uuid) AS bytes +``` + +```response +┌─uuid─────────────────────────────────┬─bytes────────────┐ +│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │ +└──────────────────────────────────────┴──────────────────┘ +``` + +``` sql +SELECT + toUUID('612f3c40-5d3b-217e-707b-6a546a3d7b29') AS uuid, + UUIDToNum(uuid, 2) AS bytes +``` + +```response +┌─uuid─────────────────────────────────┬─bytes────────────┐ +│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ @ Date: Mon, 22 Apr 2024 20:01:27 +0300 Subject: [PATCH 036/192] Small fix in generateUUIDv7WithFastCounter documentation --- src/Functions/generateUUIDv7.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index 4bd540d28db..1a7f358f263 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -360,7 +360,7 @@ The function returns a value of type UUID. .description = R"( Generates a UUID of version 7 with current Unix time having milliseconds precision, a monotonic counter within the same timestamp and the same request starting from the random value, and followed by 4 random bytes. This function takes an optional argument, the value of which is discarded to generate different values in case the function is called multiple times. -This function is a little bit faster version of the function GenerateUUIDv7WithCounter. It doesn't guarantee the counter monotony withing the same timestamp accross different requests. It means that two UUIDs having +This function is a little bit faster version of the function GenerateUUIDv7WithCounter. It doesn't guarantee the counter monotony withing the same timestamp accross different requests. The function returns a value of type UUID. )", .examples{ From 447aa5bf6926a8bf6442727d9bb2989340cda342 Mon Sep 17 00:00:00 2001 From: Alexey Petrunyaka <167422282+pet74alex@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:32:08 +0300 Subject: [PATCH 037/192] Mistypes fixes in generateUUIDv7.cpp --- src/Functions/generateUUIDv7.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index 1a7f358f263..9354e75d4f3 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -165,7 +165,7 @@ DECLARE_SEVERAL_IMPLEMENTATIONS( static constexpr auto name = "generateUUIDv7WithCounter"; struct Data : std::lock_guard, CounterDataCommon { - // Implement counter monotony whithin one timestamp accross all threads generating UUIDv7 with counter simultaneously + // Implement counter monotony within one timestamp across all threads generating UUIDv7 with counter simultaneously static inline UUIDAsArray uuid_data; static inline std::mutex mtx; Data() : std::lock_guard(mtx), CounterDataCommon(uuid_data) { } @@ -360,7 +360,7 @@ The function returns a value of type UUID. 
.description = R"( Generates a UUID of version 7 with current Unix time having milliseconds precision, a monotonic counter within the same timestamp and the same request starting from the random value, and followed by 4 random bytes. This function takes an optional argument, the value of which is discarded to generate different values in case the function is called multiple times. -This function is a little bit faster version of the function GenerateUUIDv7WithCounter. It doesn't guarantee the counter monotony withing the same timestamp accross different requests. +This function is a little bit faster version of the function GenerateUUIDv7WithCounter. It doesn't guarantee the counter monotony within the same timestamp across different requests. The function returns a value of type UUID. )", .examples{ From e9f80b8631341b57a8f3f72e7b2ff5de2fa810b7 Mon Sep 17 00:00:00 2001 From: Alexey Petrunyaka <167422282+pet74alex@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:36:07 +0300 Subject: [PATCH 038/192] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index e63a7608210..811bf3f8e9c 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1008,7 +1008,9 @@ URLPathHierarchy USearch UUIDNumToString UUIDStringToNum +UUIDToNum UUIDs +UUIDv UUid Uber Uint @@ -1043,6 +1045,8 @@ Wether WikiStat WindowView Winkler +WithCounter +WithFastCounter WithNames WithNamesAndTypes WordNet From 2ba6be6d8b88a1f6d876e9ea86a88a49fa5f9a3d Mon Sep 17 00:00:00 2001 From: Alexey Petrunyaka <167422282+pet74alex@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:39:29 +0300 Subject: [PATCH 039/192] Small style fix in generateUUIDv7.cpp --- src/Functions/generateUUIDv7.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index 9354e75d4f3..e6f78543c0d 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -46,7 +46,8 @@ inline void fillTimestamp(UUIDAsArray & uuid, uint64_t timestamp) DECLARE_SEVERAL_IMPLEMENTATIONS( - namespace UUIDv7Impl { + namespace UUIDv7Impl + { inline void store(UUID & new_uuid, UUIDAsArray & uuid) { uuid[6] = (uuid[6] & 0x0f) | 0x70; // version 7 From 9c744e50ec6494a14f24004a58ba0d67686388e4 Mon Sep 17 00:00:00 2001 From: Alexey Petrunyaka <167422282+pet74alex@users.noreply.github.com> Date: Tue, 23 Apr 2024 12:55:17 +0300 Subject: [PATCH 040/192] Update generateUUIDv7.cpp for style check test --- src/Functions/generateUUIDv7.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index e6f78543c0d..17619caffec 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -172,8 +172,7 @@ DECLARE_SEVERAL_IMPLEMENTATIONS( Data() : std::lock_guard(mtx), CounterDataCommon(uuid_data) { } }; }; - } // namespace UUIDv7Impl - + } template class FunctionGenerateUUIDv7Base From 35d700a5af99a8046faa4f75993973291d92d0c9 Mon Sep 17 00:00:00 2001 From: Alexey Petrunyaka <167422282+pet74alex@users.noreply.github.com> Date: Tue, 23 Apr 2024 14:22:58 +0300 Subject: [PATCH 041/192] Update generateUUIDv7.cpp small fixes for clang-tidy checks --- src/Functions/generateUUIDv7.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) 
diff --git a/src/Functions/generateUUIDv7.cpp b/src/Functions/generateUUIDv7.cpp index 17619caffec..101fe4a279d 100644 --- a/src/Functions/generateUUIDv7.cpp +++ b/src/Functions/generateUUIDv7.cpp @@ -60,7 +60,7 @@ DECLARE_SEVERAL_IMPLEMENTATIONS( struct UUIDv7Base { UUIDAsArray & uuid; - UUIDv7Base(UUIDAsArray & u) : uuid(u) { } + explicit UUIDv7Base(UUIDAsArray & u) : uuid(u) { } }; struct RandomData @@ -83,7 +83,7 @@ DECLARE_SEVERAL_IMPLEMENTATIONS( struct CounterDataCommon : UUIDv7Base { - CounterDataCommon(UUIDAsArray & u) : UUIDv7Base(u) { } + explicit CounterDataCommon(UUIDAsArray & u) : UUIDv7Base(u) { } uint64_t getCounter() { From f0faac2e8bcef83289e858e4cbfe374f68cf2ce4 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 23 Apr 2024 14:02:26 +0000 Subject: [PATCH 042/192] Move MergeTree*BloomFilter into the same header/source file --- .../MergeTreeIndexAggregatorBloomFilter.cpp | 65 -- .../MergeTreeIndexAggregatorBloomFilter.h | 30 - .../MergeTree/MergeTreeIndexBloomFilter.cpp | 866 +++++++++++++++++- .../MergeTree/MergeTreeIndexBloomFilter.h | 121 ++- .../MergeTreeIndexConditionBloomFilter.cpp | 729 --------------- .../MergeTreeIndexConditionBloomFilter.h | 87 -- .../MergeTreeIndexGranuleBloomFilter.cpp | 102 --- .../MergeTreeIndexGranuleBloomFilter.h | 35 - 8 files changed, 979 insertions(+), 1056 deletions(-) delete mode 100644 src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp delete mode 100644 src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h delete mode 100644 src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp delete mode 100644 src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h delete mode 100644 src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp delete mode 100644 src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h diff --git a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp deleted file mode 100644 index c69c54f1c0d..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -MergeTreeIndexAggregatorBloomFilter::MergeTreeIndexAggregatorBloomFilter( - size_t bits_per_row_, size_t hash_functions_, const Names & columns_name_) - : bits_per_row(bits_per_row_), hash_functions(hash_functions_), index_columns_name(columns_name_), column_hashes(columns_name_.size()) -{ - assert(bits_per_row != 0); - assert(hash_functions != 0); -} - -bool MergeTreeIndexAggregatorBloomFilter::empty() const -{ - return !total_rows; -} - -MergeTreeIndexGranulePtr MergeTreeIndexAggregatorBloomFilter::getGranuleAndReset() -{ - const auto granule = std::make_shared(bits_per_row, hash_functions, column_hashes); - total_rows = 0; - column_hashes.clear(); - return granule; -} - -void MergeTreeIndexAggregatorBloomFilter::update(const Block & block, size_t * pos, size_t limit) -{ - if (*pos >= block.rows()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. 
" - "Position: {}, Block rows: {}.", *pos, block.rows()); - - Block granule_index_block; - size_t max_read_rows = std::min(block.rows() - *pos, limit); - - for (size_t column = 0; column < index_columns_name.size(); ++column) - { - const auto & column_and_type = block.getByName(index_columns_name[column]); - auto index_column = BloomFilterHash::hashWithColumn(column_and_type.type, column_and_type.column, *pos, max_read_rows); - - const auto & index_col = checkAndGetColumn(index_column.get()); - const auto & index_data = index_col->getData(); - for (const auto & hash: index_data) - column_hashes[column].insert(hash); - } - - *pos += max_read_rows; - total_rows += max_read_rows; -} - -} diff --git a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h deleted file mode 100644 index d20653b7689..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexAggregatorBloomFilter.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -class MergeTreeIndexAggregatorBloomFilter final : public IMergeTreeIndexAggregator -{ -public: - MergeTreeIndexAggregatorBloomFilter(size_t bits_per_row_, size_t hash_functions_, const Names & columns_name_); - - bool empty() const override; - - MergeTreeIndexGranulePtr getGranuleAndReset() override; - - void update(const Block & block, size_t * pos, size_t limit) override; - -private: - size_t bits_per_row; - size_t hash_functions; - const Names index_columns_name; - - std::vector> column_hashes; - size_t total_rows = 0; -}; - -} diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp index dbd33609a00..ed091022a91 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp @@ -1,13 +1,36 @@ #include -#include -#include -#include -#include -#include -#include + +#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include +#include namespace DB @@ -17,8 +40,839 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int INCORRECT_QUERY; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int LOGICAL_ERROR; +} + +MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_) + : bits_per_row(bits_per_row_), hash_functions(hash_functions_), bloom_filters(index_columns_) +{ + total_rows = 0; + for (size_t column = 0; column < index_columns_; ++column) + bloom_filters[column] = std::make_shared(bits_per_row, hash_functions, 0); +} + +MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter( + size_t bits_per_row_, size_t hash_functions_, const std::vector>& column_hashes_) + : bits_per_row(bits_per_row_), hash_functions(hash_functions_), bloom_filters(column_hashes_.size()) +{ + if (column_hashes_.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule_index_blocks empty or total_rows is zero."); + + size_t bloom_filter_max_size = 0; + for (const auto & column_hash : column_hashes_) + bloom_filter_max_size = std::max(bloom_filter_max_size, column_hash.size()); + + static size_t atom_size = 8; + + // If 
multiple columns are given, we will initialize all the bloom filters + // with the size of the highest-cardinality one. This is done for compatibility with + // existing binary serialization format + total_rows = bloom_filter_max_size; + size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; + + for (size_t column = 0, columns = column_hashes_.size(); column < columns; ++column) + { + bloom_filters[column] = std::make_shared(bytes_size, hash_functions, 0); + fillingBloomFilter(bloom_filters[column], column_hashes_[column]); + } +} + +bool MergeTreeIndexGranuleBloomFilter::empty() const +{ + return !total_rows; +} + +void MergeTreeIndexGranuleBloomFilter::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) +{ + if (version != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version); + + readVarUInt(total_rows, istr); + + static size_t atom_size = 8; + size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; + size_t read_size = bytes_size; + for (auto & filter : bloom_filters) + { + filter->resize(bytes_size); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + read_size = filter->getFilter().size() * sizeof(BloomFilter::UnderType); +#endif + istr.readStrict(reinterpret_cast(filter->getFilter().data()), read_size); + } +} + +void MergeTreeIndexGranuleBloomFilter::serializeBinary(WriteBuffer & ostr) const +{ + if (empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty bloom filter index."); + + writeVarUInt(total_rows, ostr); + + static size_t atom_size = 8; + size_t write_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; + for (const auto & bloom_filter : bloom_filters) + { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + write_size = bloom_filter->getFilter().size() * sizeof(BloomFilter::UnderType); +#endif + ostr.write(reinterpret_cast(bloom_filter->getFilter().data()), write_size); + } +} + +void MergeTreeIndexGranuleBloomFilter::fillingBloomFilter(BloomFilterPtr & bf, const HashSet &hashes) const +{ + for (const auto & bf_base_hash : hashes) + for (size_t i = 0; i < hash_functions; ++i) + bf->addHashWithSeed(bf_base_hash.getKey(), BloomFilterHash::bf_hash_seed[i]); +} + +namespace +{ + +ColumnWithTypeAndName getPreparedSetInfo(const ConstSetPtr & prepared_set) +{ + if (prepared_set->getDataTypes().size() == 1) + return {prepared_set->getSetElements()[0], prepared_set->getElementsTypes()[0], "dummy"}; + + Columns set_elements; + for (auto & set_element : prepared_set->getSetElements()) + + set_elements.emplace_back(set_element->convertToFullColumnIfConst()); + + return {ColumnTuple::create(set_elements), std::make_shared(prepared_set->getElementsTypes()), "dummy"}; +} + +bool hashMatchesFilter(const BloomFilterPtr& bloom_filter, UInt64 hash, size_t hash_functions) +{ + return std::all_of(BloomFilterHash::bf_hash_seed, + BloomFilterHash::bf_hash_seed + hash_functions, + [&](const auto &hash_seed) + { + return bloom_filter->findHashWithSeed(hash, + hash_seed); + }); +} + +bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & bloom_filter, size_t hash_functions, bool match_all) +{ + const auto * const_column = typeid_cast(hash_column); + const auto * non_const_column = typeid_cast(hash_column); + + if (!const_column && !non_const_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Hash column must be Const or UInt64."); + + if (const_column) + { + return hashMatchesFilter(bloom_filter, + const_column->getValue(), + hash_functions); + } + + const 
ColumnUInt64::Container & hashes = non_const_column->getData(); + + if (match_all) + { + return std::all_of(hashes.begin(), + hashes.end(), + [&](const auto& hash_row) + { + return hashMatchesFilter(bloom_filter, + hash_row, + hash_functions); + }); + } + else + { + return std::any_of(hashes.begin(), + hashes.end(), + [&](const auto& hash_row) + { + return hashMatchesFilter(bloom_filter, + hash_row, + hash_functions); + }); + } +} + +} + +MergeTreeIndexConditionBloomFilter::MergeTreeIndexConditionBloomFilter( + const ActionsDAGPtr & filter_actions_dag, ContextPtr context_, const Block & header_, size_t hash_functions_) + : WithContext(context_), header(header_), hash_functions(hash_functions_) +{ + if (!filter_actions_dag) + { + rpn.push_back(RPNElement::FUNCTION_UNKNOWN); + return; + } + + RPNBuilder builder( + filter_actions_dag->getOutputs().at(0), + context_, + [&](const RPNBuilderTreeNode & node, RPNElement & out) { return extractAtomFromTree(node, out); }); + rpn = std::move(builder).extractRPN(); +} + +bool MergeTreeIndexConditionBloomFilter::alwaysUnknownOrTrue() const +{ + std::vector rpn_stack; + + for (const auto & element : rpn) + { + if (element.function == RPNElement::FUNCTION_UNKNOWN + || element.function == RPNElement::ALWAYS_TRUE) + { + rpn_stack.push_back(true); + } + else if (element.function == RPNElement::FUNCTION_EQUALS + || element.function == RPNElement::FUNCTION_NOT_EQUALS + || element.function == RPNElement::FUNCTION_HAS + || element.function == RPNElement::FUNCTION_HAS_ANY + || element.function == RPNElement::FUNCTION_HAS_ALL + || element.function == RPNElement::FUNCTION_IN + || element.function == RPNElement::FUNCTION_NOT_IN + || element.function == RPNElement::ALWAYS_FALSE) + { + rpn_stack.push_back(false); + } + else if (element.function == RPNElement::FUNCTION_NOT) + { + // do nothing + } + else if (element.function == RPNElement::FUNCTION_AND) + { + auto arg1 = rpn_stack.back(); + rpn_stack.pop_back(); + auto arg2 = rpn_stack.back(); + rpn_stack.back() = arg1 && arg2; + } + else if (element.function == RPNElement::FUNCTION_OR) + { + auto arg1 = rpn_stack.back(); + rpn_stack.pop_back(); + auto arg2 = rpn_stack.back(); + rpn_stack.back() = arg1 || arg2; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function type in KeyCondition::RPNElement"); + } + + return rpn_stack[0]; +} + +bool MergeTreeIndexConditionBloomFilter::mayBeTrueOnGranule(const MergeTreeIndexGranuleBloomFilter * granule) const +{ + std::vector rpn_stack; + const auto & filters = granule->getFilters(); + + for (const auto & element : rpn) + { + if (element.function == RPNElement::FUNCTION_UNKNOWN) + { + rpn_stack.emplace_back(true, true); + } + else if (element.function == RPNElement::FUNCTION_IN + || element.function == RPNElement::FUNCTION_NOT_IN + || element.function == RPNElement::FUNCTION_EQUALS + || element.function == RPNElement::FUNCTION_NOT_EQUALS + || element.function == RPNElement::FUNCTION_HAS + || element.function == RPNElement::FUNCTION_HAS_ANY + || element.function == RPNElement::FUNCTION_HAS_ALL) + { + bool match_rows = true; + bool match_all = element.function == RPNElement::FUNCTION_HAS_ALL; + const auto & predicate = element.predicate; + for (size_t index = 0; match_rows && index < predicate.size(); ++index) + { + const auto & query_index_hash = predicate[index]; + const auto & filter = filters[query_index_hash.first]; + const ColumnPtr & hash_column = query_index_hash.second; + + match_rows = maybeTrueOnBloomFilter(&*hash_column, + filter, + 
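+                                                    /* for hasAll every hash must hit the filter; otherwise a single hit suffices */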
hash_functions, + match_all); + } + + rpn_stack.emplace_back(match_rows, true); + if (element.function == RPNElement::FUNCTION_NOT_EQUALS || element.function == RPNElement::FUNCTION_NOT_IN) + rpn_stack.back() = !rpn_stack.back(); + } + else if (element.function == RPNElement::FUNCTION_NOT) + { + rpn_stack.back() = !rpn_stack.back(); + } + else if (element.function == RPNElement::FUNCTION_OR) + { + auto arg1 = rpn_stack.back(); + rpn_stack.pop_back(); + auto arg2 = rpn_stack.back(); + rpn_stack.back() = arg1 | arg2; + } + else if (element.function == RPNElement::FUNCTION_AND) + { + auto arg1 = rpn_stack.back(); + rpn_stack.pop_back(); + auto arg2 = rpn_stack.back(); + rpn_stack.back() = arg1 & arg2; + } + else if (element.function == RPNElement::ALWAYS_TRUE) + { + rpn_stack.emplace_back(true, false); + } + else if (element.function == RPNElement::ALWAYS_FALSE) + { + rpn_stack.emplace_back(false, true); + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function type in KeyCondition::RPNElement"); + } + + if (rpn_stack.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::mayBeTrueInRange"); + + return rpn_stack[0].can_be_true; +} + +bool MergeTreeIndexConditionBloomFilter::extractAtomFromTree(const RPNBuilderTreeNode & node, RPNElement & out) +{ + { + Field const_value; + DataTypePtr const_type; + + if (node.tryGetConstant(const_value, const_type)) + { + if (const_value.getType() == Field::Types::UInt64) + { + out.function = const_value.get() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; + return true; + } + + if (const_value.getType() == Field::Types::Int64) + { + out.function = const_value.get() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; + return true; + } + + if (const_value.getType() == Field::Types::Float64) + { + out.function = const_value.get() != 0.0 ? 
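+                    /* any non-zero floating-point constant makes the atom always true */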
RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; + return true; + } + } + } + + return traverseFunction(node, out, nullptr /*parent*/); +} + +bool MergeTreeIndexConditionBloomFilter::traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent) +{ + bool maybe_useful = false; + + if (node.isFunction()) + { + const auto function = node.toFunctionNode(); + auto arguments_size = function.getArgumentsSize(); + auto function_name = function.getFunctionName(); + + for (size_t i = 0; i < arguments_size; ++i) + { + auto argument = function.getArgumentAt(i); + if (traverseFunction(argument, out, &node)) + maybe_useful = true; + } + + if (arguments_size != 2) + return false; + + auto lhs_argument = function.getArgumentAt(0); + auto rhs_argument = function.getArgumentAt(1); + + if (functionIsInOrGlobalInOperator(function_name)) + { + if (auto future_set = rhs_argument.tryGetPreparedSet(); future_set) + { + if (auto prepared_set = future_set->buildOrderedSetInplace(rhs_argument.getTreeContext().getQueryContext()); prepared_set) + { + if (prepared_set->hasExplicitSetElements()) + { + const auto prepared_info = getPreparedSetInfo(prepared_set); + if (traverseTreeIn(function_name, lhs_argument, prepared_set, prepared_info.type, prepared_info.column, out)) + maybe_useful = true; + } + } + } + } + else if (function_name == "equals" || + function_name == "notEquals" || + function_name == "has" || + function_name == "mapContains" || + function_name == "indexOf" || + function_name == "hasAny" || + function_name == "hasAll") + { + Field const_value; + DataTypePtr const_type; + + if (rhs_argument.tryGetConstant(const_value, const_type)) + { + if (traverseTreeEquals(function_name, lhs_argument, const_type, const_value, out, parent)) + maybe_useful = true; + } + else if (lhs_argument.tryGetConstant(const_value, const_type)) + { + if (traverseTreeEquals(function_name, rhs_argument, const_type, const_value, out, parent)) + maybe_useful = true; + } + } + } + + return maybe_useful; +} + +bool MergeTreeIndexConditionBloomFilter::traverseTreeIn( + const String & function_name, + const RPNBuilderTreeNode & key_node, + const ConstSetPtr & prepared_set, + const DataTypePtr & type, + const ColumnPtr & column, + RPNElement & out) +{ + auto key_node_column_name = key_node.getColumnName(); + + if (header.has(key_node_column_name)) + { + size_t row_size = column->size(); + size_t position = header.getPositionByName(key_node_column_name); + const DataTypePtr & index_type = header.getByPosition(position).type; + const auto & converted_column = castColumn(ColumnWithTypeAndName{column, type, ""}, index_type); + out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(index_type, converted_column, 0, row_size))); + + if (function_name == "in" || function_name == "globalIn") + out.function = RPNElement::FUNCTION_IN; + + if (function_name == "notIn" || function_name == "globalNotIn") + out.function = RPNElement::FUNCTION_NOT_IN; + + return true; + } + + if (key_node.isFunction()) + { + auto key_node_function = key_node.toFunctionNode(); + auto key_node_function_name = key_node_function.getFunctionName(); + size_t key_node_function_arguments_size = key_node_function.getArgumentsSize(); + + WhichDataType which(type); + + if (which.isTuple() && key_node_function_name == "tuple") + { + const auto & tuple_column = typeid_cast(column.get()); + const auto & tuple_data_type = typeid_cast(type.get()); + + if (tuple_data_type->getElements().size() != 
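+                /* the number of elements in the IN-set tuple must match the arity of tuple(...) on the key side */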
key_node_function_arguments_size || tuple_column->getColumns().size() != key_node_function_arguments_size)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal types of arguments of function {}", function_name);
+
+            bool match_with_subtype = false;
+            const auto & sub_columns = tuple_column->getColumns();
+            const auto & sub_data_types = tuple_data_type->getElements();
+
+            for (size_t index = 0; index < key_node_function_arguments_size; ++index)
+                match_with_subtype |= traverseTreeIn(function_name, key_node_function.getArgumentAt(index), nullptr, sub_data_types[index], sub_columns[index], out);
+
+            return match_with_subtype;
+        }
+
+        if (key_node_function_name == "arrayElement")
+        {
+            /** Try to parse arrayElement for the mapKeys index.
+              * It is important to ignore keys like column_map['Key'] IN ('') because if the key does not exist in the map,
+              * arrayElement returns the default value.
+              *
+              * We cannot skip keys that do not exist in the map when the comparison is with the type's default value, because
+              * that way we would skip necessary granules where the map key does not exist.
+              */
+            if (!prepared_set)
+                return false;
+
+            auto default_column_to_check = type->createColumnConstWithDefaultValue(1)->convertToFullColumnIfConst();
+            ColumnWithTypeAndName default_column_with_type_to_check { default_column_to_check, type, "" };
+            ColumnsWithTypeAndName default_columns_with_type_to_check = {default_column_with_type_to_check};
+            auto set_contains_default_value_predicate_column = prepared_set->execute(default_columns_with_type_to_check, false /*negative*/);
+            const auto & set_contains_default_value_predicate_column_typed = assert_cast<const ColumnUInt8 &>(*set_contains_default_value_predicate_column);
+            bool set_contain_default_value = set_contains_default_value_predicate_column_typed.getData()[0];
+            if (set_contain_default_value)
+                return false;
+
+            auto first_argument = key_node_function.getArgumentAt(0);
+            const auto column_name = first_argument.getColumnName();
+            auto map_keys_index_column_name = fmt::format("mapKeys({})", column_name);
+            auto map_values_index_column_name = fmt::format("mapValues({})", column_name);
+
+            if (header.has(map_keys_index_column_name))
+            {
+                /// For mapKeys we serialize the key argument with the bloom filter
+
+                auto second_argument = key_node_function.getArgumentAt(1);
+
+                Field constant_value;
+                DataTypePtr constant_type;
+
+                if (second_argument.tryGetConstant(constant_value, constant_type))
+                {
+                    size_t position = header.getPositionByName(map_keys_index_column_name);
+                    const DataTypePtr & index_type = header.getByPosition(position).type;
+                    const DataTypePtr actual_type = BloomFilter::getPrimitiveType(index_type);
+                    out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), constant_value)));
+                }
+                else
+                {
+                    return false;
+                }
+            }
+            else if (header.has(map_values_index_column_name))
+            {
+                /// For mapValues we serialize the set with the bloom filter
+
+                size_t row_size = column->size();
+                size_t position = header.getPositionByName(map_values_index_column_name);
+                const DataTypePtr & index_type = header.getByPosition(position).type;
+                const auto & array_type = assert_cast<const DataTypeArray &>(*index_type);
+                const auto & array_nested_type = array_type.getNestedType();
+                const auto & converted_column = castColumn(ColumnWithTypeAndName{column, type, ""}, array_nested_type);
+                out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(array_nested_type, converted_column, 0, row_size)));
+            }
+            else
+            {
+                return false;
+            }
+
+            if (function_name == "in" || function_name == "globalIn")
+                out.function = RPNElement::FUNCTION_IN;
+
+            if (function_name == "notIn" || function_name == "globalNotIn")
+                out.function = RPNElement::FUNCTION_NOT_IN;
+
+            return true;
+        }
+    }
+
+    return false;
+}
+
+
+static bool indexOfCanUseBloomFilter(const RPNBuilderTreeNode * parent)
+{
+    if (!parent)
+        return true;
+
+    if (!parent->isFunction())
+        return false;
+
+    auto function = parent->toFunctionNode();
+    auto function_name = function.getFunctionName();
+
+    /// `parent` is the function in which `indexOf` is located.
+    /// Example: `indexOf(arr, x) = 1`, parent is a function named `equals`.
+    if (function_name == "and")
+    {
+        return true;
+    }
+    else if (function_name == "equals" /// notEquals is not applicable
+        || function_name == "greater" || function_name == "greaterOrEquals"
+        || function_name == "less" || function_name == "lessOrEquals")
+    {
+        size_t function_arguments_size = function.getArgumentsSize();
+        if (function_arguments_size != 2)
+            return false;
+
+        /// We don't handle constant expressions like `indexOf(arr, x) = 1 + 0`, but such cases are negligible.
+
+        /// We should return true when the corresponding expression implies that the array contains the element.
+        /// Example: when `indexOf(arr, x) > 10` is written, it means that `arr` must contain the element
+        /// (at least at the 11th position, though the exact position does not matter).
+
+        bool reversed = false;
+        Field constant_value;
+        DataTypePtr constant_type;
+
+        if (function.getArgumentAt(0).tryGetConstant(constant_value, constant_type))
+        {
+            reversed = true;
+        }
+        else if (function.getArgumentAt(1).tryGetConstant(constant_value, constant_type))
+        {
+        }
+        else
+        {
+            return false;
+        }
+
+        Field zero(0);
+        bool constant_equal_zero = applyVisitor(FieldVisitorAccurateEquals(), constant_value, zero);
+
+        if (function_name == "equals" && !constant_equal_zero)
+        {
+            /// indexOf(...) = c, c != 0
+            return true;
+        }
+        else if (function_name == "notEquals" && constant_equal_zero)
+        {
+            /// indexOf(...) != c, c = 0
+            return true;
+        }
+        else if (function_name == (reversed ? "less" : "greater") && !applyVisitor(FieldVisitorAccurateLess(), constant_value, zero))
+        {
+            /// indexOf(...) > c, c >= 0
+            return true;
+        }
+        else if (function_name == (reversed ? "lessOrEquals" : "greaterOrEquals") && applyVisitor(FieldVisitorAccurateLess(), zero, constant_value))
+        {
+            /// indexOf(...) >= c, c > 0
+            return true;
+        }
+
+        return false;
+    }
+
+    return false;
+}
+
+
+bool MergeTreeIndexConditionBloomFilter::traverseTreeEquals(
+    const String & function_name,
+    const RPNBuilderTreeNode & key_node,
+    const DataTypePtr & value_type,
+    const Field & value_field,
+    RPNElement & out,
+    const RPNBuilderTreeNode * parent)
+{
+    auto key_column_name = key_node.getColumnName();
+
+    if (header.has(key_column_name))
+    {
+        size_t position = header.getPositionByName(key_column_name);
+        const DataTypePtr & index_type = header.getByPosition(position).type;
+        const auto * array_type = typeid_cast<const DataTypeArray *>(index_type.get());
+
+        if (function_name == "has" || function_name == "indexOf")
+        {
+            if (!array_type)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", function_name);
+
+            /// We can treat the `indexOf` function similarly to `has`.
+            /// But it is a little more cumbersome, compare: `has(arr, elem)` and `indexOf(arr, elem) != 0`.
+            /// The `parent` in this context is expected to be function `!=` (`notEquals`). 
+            if (function_name == "has" || indexOfCanUseBloomFilter(parent))
+            {
+                out.function = RPNElement::FUNCTION_HAS;
+                const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType());
+                auto converted_field = convertFieldToType(value_field, *actual_type, value_type.get());
+                if (converted_field.isNull())
+                    return false;
+
+                out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field)));
+            }
+        }
+        else if (function_name == "hasAny" || function_name == "hasAll")
+        {
+            if (!array_type)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", function_name);
+
+            if (value_field.getType() != Field::Types::Array)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument for function {} must be an array.", function_name);
+
+            const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType());
+            ColumnPtr column;
+            {
+                const bool is_nullable = actual_type->isNullable();
+                auto mutable_column = actual_type->createColumn();
+
+                for (const auto & f : value_field.get<Array>())
+                {
+                    if ((f.isNull() && !is_nullable) || f.isDecimal(f.getType())) /// NOLINT(readability-static-accessed-through-instance)
+                        return false;
+
+                    auto converted = convertFieldToType(f, *actual_type);
+                    if (converted.isNull())
+                        return false;
+
+                    mutable_column->insert(converted);
+                }
+
+                column = std::move(mutable_column);
+            }
+
+            out.function = function_name == "hasAny" ?
+                RPNElement::FUNCTION_HAS_ANY :
+                RPNElement::FUNCTION_HAS_ALL;
+            out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(actual_type, column, 0, column->size())));
+        }
+        else
+        {
+            if (array_type)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                    "An array type of bloom_filter supports only has(), indexOf(), hasAny(), and hasAll() functions.");
+
+            out.function = function_name == "equals" ? 
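+                /* equality on a plain indexed column reduces to a single-hash filter lookup */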
RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS;
+            const DataTypePtr actual_type = BloomFilter::getPrimitiveType(index_type);
+            auto converted_field = convertFieldToType(value_field, *actual_type, value_type.get());
+            if (converted_field.isNull())
+                return false;
+
+            out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field)));
+        }
+
+        return true;
+    }
+
+    if (function_name == "mapContains" || function_name == "has")
+    {
+        auto map_keys_index_column_name = fmt::format("mapKeys({})", key_column_name);
+        if (!header.has(map_keys_index_column_name))
+            return false;
+
+        size_t position = header.getPositionByName(map_keys_index_column_name);
+        const DataTypePtr & index_type = header.getByPosition(position).type;
+        const auto * array_type = typeid_cast<const DataTypeArray *>(index_type.get());
+
+        if (!array_type)
+            return false;
+
+        out.function = RPNElement::FUNCTION_HAS;
+        const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType());
+        auto converted_field = convertFieldToType(value_field, *actual_type, value_type.get());
+        if (converted_field.isNull())
+            return false;
+
+        out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field)));
+        return true;
+    }
+
+    if (key_node.isFunction())
+    {
+        WhichDataType which(value_type);
+
+        auto key_node_function = key_node.toFunctionNode();
+        auto key_node_function_name = key_node_function.getFunctionName();
+        size_t key_node_function_arguments_size = key_node_function.getArgumentsSize();
+
+        if (which.isTuple() && key_node_function_name == "tuple")
+        {
+            const Tuple & tuple = value_field.get<Tuple>();
+            const auto * value_tuple_data_type = typeid_cast<const DataTypeTuple *>(value_type.get());
+
+            if (tuple.size() != key_node_function_arguments_size)
+                throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal types of arguments of function {}", function_name);
+
+            bool match_with_subtype = false;
+            const DataTypes & subtypes = value_tuple_data_type->getElements();
+
+            for (size_t index = 0; index < tuple.size(); ++index)
+                match_with_subtype |= traverseTreeEquals(function_name, key_node_function.getArgumentAt(index), subtypes[index], tuple[index], out, &key_node);
+
+            return match_with_subtype;
+        }
+
+        if (key_node_function_name == "arrayElement" && (function_name == "equals" || function_name == "notEquals"))
+        {
+            /** Try to parse arrayElement for the mapKeys index.
+              * It is important to ignore keys like column_map['Key'] = '' because if the key does not exist in the map,
+              * arrayElement returns the default value.
+              *
+              * We cannot skip keys that do not exist in the map when the comparison is with the type's default value, because
+              * that way we would skip necessary granules where the map key does not exist. 
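+              * For example, with an index such as `INDEX idx mapKeys(column_map) TYPE bloom_filter`,
+              * `column_map['Key'] = 'value'` may prune granules, while `column_map['Key'] = ''`
+              * must not, since '' is also what arrayElement returns for keys that are absent.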
+ */ + if (value_field == value_type->getDefault()) + return false; + + auto first_argument = key_node_function.getArgumentAt(0); + const auto column_name = first_argument.getColumnName(); + + auto map_keys_index_column_name = fmt::format("mapKeys({})", column_name); + auto map_values_index_column_name = fmt::format("mapValues({})", column_name); + + size_t position = 0; + Field const_value = value_field; + DataTypePtr const_type; + + if (header.has(map_keys_index_column_name)) + { + position = header.getPositionByName(map_keys_index_column_name); + auto second_argument = key_node_function.getArgumentAt(1); + + if (!second_argument.tryGetConstant(const_value, const_type)) + return false; + } + else if (header.has(map_values_index_column_name)) + { + position = header.getPositionByName(map_values_index_column_name); + } + else + { + return false; + } + + out.function = function_name == "equals" ? RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS; + + const auto & index_type = header.getByPosition(position).type; + const auto actual_type = BloomFilter::getPrimitiveType(index_type); + out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), const_value))); + + return true; + } + } + + return false; +} + +MergeTreeIndexAggregatorBloomFilter::MergeTreeIndexAggregatorBloomFilter( + size_t bits_per_row_, size_t hash_functions_, const Names & columns_name_) + : bits_per_row(bits_per_row_), hash_functions(hash_functions_), index_columns_name(columns_name_), column_hashes(columns_name_.size()) +{ + assert(bits_per_row != 0); + assert(hash_functions != 0); +} + +bool MergeTreeIndexAggregatorBloomFilter::empty() const +{ + return !total_rows; +} + +MergeTreeIndexGranulePtr MergeTreeIndexAggregatorBloomFilter::getGranuleAndReset() +{ + const auto granule = std::make_shared(bits_per_row, hash_functions, column_hashes); + total_rows = 0; + column_hashes.clear(); + return granule; +} + +void MergeTreeIndexAggregatorBloomFilter::update(const Block & block, size_t * pos, size_t limit) +{ + if (*pos >= block.rows()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. 
" + "Position: {}, Block rows: {}.", *pos, block.rows()); + + Block granule_index_block; + size_t max_read_rows = std::min(block.rows() - *pos, limit); + + for (size_t column = 0; column < index_columns_name.size(); ++column) + { + const auto & column_and_type = block.getByName(index_columns_name[column]); + auto index_column = BloomFilterHash::hashWithColumn(column_and_type.type, column_and_type.column, *pos, max_read_rows); + + const auto & index_col = checkAndGetColumn(index_column.get()); + const auto & index_data = index_col->getData(); + for (const auto & hash: index_data) + column_hashes[column].insert(hash); + } + + *pos += max_read_rows; + total_rows += max_read_rows; } MergeTreeIndexBloomFilter::MergeTreeIndexBloomFilter( diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h index d6f4d6f2cf5..eeaa938551c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.h @@ -1,13 +1,130 @@ #pragma once +#include +#include #include +#include #include -#include -#include namespace DB { +class MergeTreeIndexGranuleBloomFilter final : public IMergeTreeIndexGranule +{ +public: + MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_); + + MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, const std::vector> & column_hashes); + + bool empty() const override; + + void serializeBinary(WriteBuffer & ostr) const override; + void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; + + const std::vector & getFilters() const { return bloom_filters; } + +private: + const size_t bits_per_row; + const size_t hash_functions; + + size_t total_rows = 0; + std::vector bloom_filters; + + void fillingBloomFilter(BloomFilterPtr & bf, const HashSet & hashes) const; +}; + +class MergeTreeIndexConditionBloomFilter final : public IMergeTreeIndexCondition, WithContext +{ +public: + struct RPNElement + { + enum Function + { + /// Atoms of a Boolean expression. + FUNCTION_EQUALS, + FUNCTION_NOT_EQUALS, + FUNCTION_HAS, + FUNCTION_HAS_ANY, + FUNCTION_HAS_ALL, + FUNCTION_IN, + FUNCTION_NOT_IN, + FUNCTION_UNKNOWN, /// Can take any value. + /// Operators of the logical expression. 
+ FUNCTION_NOT, + FUNCTION_AND, + FUNCTION_OR, + /// Constants + ALWAYS_FALSE, + ALWAYS_TRUE, + }; + + RPNElement(Function function_ = FUNCTION_UNKNOWN) : function(function_) {} /// NOLINT + + Function function = FUNCTION_UNKNOWN; + std::vector> predicate; + }; + + MergeTreeIndexConditionBloomFilter(const ActionsDAGPtr & filter_actions_dag, ContextPtr context_, const Block & header_, size_t hash_functions_); + + bool alwaysUnknownOrTrue() const override; + + bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override + { + if (const auto & bf_granule = typeid_cast(granule.get())) + return mayBeTrueOnGranule(bf_granule); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Requires bloom filter index granule."); + } + +private: + const Block & header; + const size_t hash_functions; + std::vector rpn; + + bool mayBeTrueOnGranule(const MergeTreeIndexGranuleBloomFilter * granule) const; + + bool extractAtomFromTree(const RPNBuilderTreeNode & node, RPNElement & out); + + bool traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent); + + bool traverseTreeIn( + const String & function_name, + const RPNBuilderTreeNode & key_node, + const ConstSetPtr & prepared_set, + const DataTypePtr & type, + const ColumnPtr & column, + RPNElement & out); + + bool traverseTreeEquals( + const String & function_name, + const RPNBuilderTreeNode & key_node, + const DataTypePtr & value_type, + const Field & value_field, + RPNElement & out, + const RPNBuilderTreeNode * parent); +}; + +class MergeTreeIndexAggregatorBloomFilter final : public IMergeTreeIndexAggregator +{ +public: + MergeTreeIndexAggregatorBloomFilter(size_t bits_per_row_, size_t hash_functions_, const Names & columns_name_); + + bool empty() const override; + + MergeTreeIndexGranulePtr getGranuleAndReset() override; + + void update(const Block & block, size_t * pos, size_t limit) override; + +private: + size_t bits_per_row; + size_t hash_functions; + const Names index_columns_name; + + std::vector> column_hashes; + size_t total_rows = 0; +}; + + class MergeTreeIndexBloomFilter final : public IMergeTreeIndex { public: diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp deleted file mode 100644 index 7ab90dac5b0..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.cpp +++ /dev/null @@ -1,729 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int LOGICAL_ERROR; -} - -namespace -{ - -ColumnWithTypeAndName getPreparedSetInfo(const ConstSetPtr & prepared_set) -{ - if (prepared_set->getDataTypes().size() == 1) - return {prepared_set->getSetElements()[0], prepared_set->getElementsTypes()[0], "dummy"}; - - Columns set_elements; - for (auto & set_element : prepared_set->getSetElements()) - - set_elements.emplace_back(set_element->convertToFullColumnIfConst()); - - return {ColumnTuple::create(set_elements), std::make_shared(prepared_set->getElementsTypes()), "dummy"}; -} - -bool hashMatchesFilter(const BloomFilterPtr& bloom_filter, UInt64 hash, size_t hash_functions) -{ - return std::all_of(BloomFilterHash::bf_hash_seed, - BloomFilterHash::bf_hash_seed + hash_functions, - [&](const auto &hash_seed) - { - return bloom_filter->findHashWithSeed(hash, - 
hash_seed); - }); -} - -bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & bloom_filter, size_t hash_functions, bool match_all) -{ - const auto * const_column = typeid_cast(hash_column); - const auto * non_const_column = typeid_cast(hash_column); - - if (!const_column && !non_const_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Hash column must be Const or UInt64."); - - if (const_column) - { - return hashMatchesFilter(bloom_filter, - const_column->getValue(), - hash_functions); - } - - const ColumnUInt64::Container & hashes = non_const_column->getData(); - - if (match_all) - { - return std::all_of(hashes.begin(), - hashes.end(), - [&](const auto& hash_row) - { - return hashMatchesFilter(bloom_filter, - hash_row, - hash_functions); - }); - } - else - { - return std::any_of(hashes.begin(), - hashes.end(), - [&](const auto& hash_row) - { - return hashMatchesFilter(bloom_filter, - hash_row, - hash_functions); - }); - } -} - -} - -MergeTreeIndexConditionBloomFilter::MergeTreeIndexConditionBloomFilter( - const ActionsDAGPtr & filter_actions_dag, ContextPtr context_, const Block & header_, size_t hash_functions_) - : WithContext(context_), header(header_), hash_functions(hash_functions_) -{ - if (!filter_actions_dag) - { - rpn.push_back(RPNElement::FUNCTION_UNKNOWN); - return; - } - - RPNBuilder builder( - filter_actions_dag->getOutputs().at(0), - context_, - [&](const RPNBuilderTreeNode & node, RPNElement & out) { return extractAtomFromTree(node, out); }); - rpn = std::move(builder).extractRPN(); -} - -bool MergeTreeIndexConditionBloomFilter::alwaysUnknownOrTrue() const -{ - std::vector rpn_stack; - - for (const auto & element : rpn) - { - if (element.function == RPNElement::FUNCTION_UNKNOWN - || element.function == RPNElement::ALWAYS_TRUE) - { - rpn_stack.push_back(true); - } - else if (element.function == RPNElement::FUNCTION_EQUALS - || element.function == RPNElement::FUNCTION_NOT_EQUALS - || element.function == RPNElement::FUNCTION_HAS - || element.function == RPNElement::FUNCTION_HAS_ANY - || element.function == RPNElement::FUNCTION_HAS_ALL - || element.function == RPNElement::FUNCTION_IN - || element.function == RPNElement::FUNCTION_NOT_IN - || element.function == RPNElement::ALWAYS_FALSE) - { - rpn_stack.push_back(false); - } - else if (element.function == RPNElement::FUNCTION_NOT) - { - // do nothing - } - else if (element.function == RPNElement::FUNCTION_AND) - { - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 && arg2; - } - else if (element.function == RPNElement::FUNCTION_OR) - { - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 || arg2; - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function type in KeyCondition::RPNElement"); - } - - return rpn_stack[0]; -} - -bool MergeTreeIndexConditionBloomFilter::mayBeTrueOnGranule(const MergeTreeIndexGranuleBloomFilter * granule) const -{ - std::vector rpn_stack; - const auto & filters = granule->getFilters(); - - for (const auto & element : rpn) - { - if (element.function == RPNElement::FUNCTION_UNKNOWN) - { - rpn_stack.emplace_back(true, true); - } - else if (element.function == RPNElement::FUNCTION_IN - || element.function == RPNElement::FUNCTION_NOT_IN - || element.function == RPNElement::FUNCTION_EQUALS - || element.function == RPNElement::FUNCTION_NOT_EQUALS - || element.function == RPNElement::FUNCTION_HAS - || element.function == 
RPNElement::FUNCTION_HAS_ANY - || element.function == RPNElement::FUNCTION_HAS_ALL) - { - bool match_rows = true; - bool match_all = element.function == RPNElement::FUNCTION_HAS_ALL; - const auto & predicate = element.predicate; - for (size_t index = 0; match_rows && index < predicate.size(); ++index) - { - const auto & query_index_hash = predicate[index]; - const auto & filter = filters[query_index_hash.first]; - const ColumnPtr & hash_column = query_index_hash.second; - - match_rows = maybeTrueOnBloomFilter(&*hash_column, - filter, - hash_functions, - match_all); - } - - rpn_stack.emplace_back(match_rows, true); - if (element.function == RPNElement::FUNCTION_NOT_EQUALS || element.function == RPNElement::FUNCTION_NOT_IN) - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == RPNElement::FUNCTION_NOT) - { - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == RPNElement::FUNCTION_OR) - { - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 | arg2; - } - else if (element.function == RPNElement::FUNCTION_AND) - { - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 & arg2; - } - else if (element.function == RPNElement::ALWAYS_TRUE) - { - rpn_stack.emplace_back(true, false); - } - else if (element.function == RPNElement::ALWAYS_FALSE) - { - rpn_stack.emplace_back(false, true); - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function type in KeyCondition::RPNElement"); - } - - if (rpn_stack.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::mayBeTrueInRange"); - - return rpn_stack[0].can_be_true; -} - -bool MergeTreeIndexConditionBloomFilter::extractAtomFromTree(const RPNBuilderTreeNode & node, RPNElement & out) -{ - { - Field const_value; - DataTypePtr const_type; - - if (node.tryGetConstant(const_value, const_type)) - { - if (const_value.getType() == Field::Types::UInt64) - { - out.function = const_value.get() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; - return true; - } - - if (const_value.getType() == Field::Types::Int64) - { - out.function = const_value.get() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; - return true; - } - - if (const_value.getType() == Field::Types::Float64) - { - out.function = const_value.get() != 0.0 ? 
RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; - return true; - } - } - } - - return traverseFunction(node, out, nullptr /*parent*/); -} - -bool MergeTreeIndexConditionBloomFilter::traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent) -{ - bool maybe_useful = false; - - if (node.isFunction()) - { - const auto function = node.toFunctionNode(); - auto arguments_size = function.getArgumentsSize(); - auto function_name = function.getFunctionName(); - - for (size_t i = 0; i < arguments_size; ++i) - { - auto argument = function.getArgumentAt(i); - if (traverseFunction(argument, out, &node)) - maybe_useful = true; - } - - if (arguments_size != 2) - return false; - - auto lhs_argument = function.getArgumentAt(0); - auto rhs_argument = function.getArgumentAt(1); - - if (functionIsInOrGlobalInOperator(function_name)) - { - if (auto future_set = rhs_argument.tryGetPreparedSet(); future_set) - { - if (auto prepared_set = future_set->buildOrderedSetInplace(rhs_argument.getTreeContext().getQueryContext()); prepared_set) - { - if (prepared_set->hasExplicitSetElements()) - { - const auto prepared_info = getPreparedSetInfo(prepared_set); - if (traverseTreeIn(function_name, lhs_argument, prepared_set, prepared_info.type, prepared_info.column, out)) - maybe_useful = true; - } - } - } - } - else if (function_name == "equals" || - function_name == "notEquals" || - function_name == "has" || - function_name == "mapContains" || - function_name == "indexOf" || - function_name == "hasAny" || - function_name == "hasAll") - { - Field const_value; - DataTypePtr const_type; - - if (rhs_argument.tryGetConstant(const_value, const_type)) - { - if (traverseTreeEquals(function_name, lhs_argument, const_type, const_value, out, parent)) - maybe_useful = true; - } - else if (lhs_argument.tryGetConstant(const_value, const_type)) - { - if (traverseTreeEquals(function_name, rhs_argument, const_type, const_value, out, parent)) - maybe_useful = true; - } - } - } - - return maybe_useful; -} - -bool MergeTreeIndexConditionBloomFilter::traverseTreeIn( - const String & function_name, - const RPNBuilderTreeNode & key_node, - const ConstSetPtr & prepared_set, - const DataTypePtr & type, - const ColumnPtr & column, - RPNElement & out) -{ - auto key_node_column_name = key_node.getColumnName(); - - if (header.has(key_node_column_name)) - { - size_t row_size = column->size(); - size_t position = header.getPositionByName(key_node_column_name); - const DataTypePtr & index_type = header.getByPosition(position).type; - const auto & converted_column = castColumn(ColumnWithTypeAndName{column, type, ""}, index_type); - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(index_type, converted_column, 0, row_size))); - - if (function_name == "in" || function_name == "globalIn") - out.function = RPNElement::FUNCTION_IN; - - if (function_name == "notIn" || function_name == "globalNotIn") - out.function = RPNElement::FUNCTION_NOT_IN; - - return true; - } - - if (key_node.isFunction()) - { - auto key_node_function = key_node.toFunctionNode(); - auto key_node_function_name = key_node_function.getFunctionName(); - size_t key_node_function_arguments_size = key_node_function.getArgumentsSize(); - - WhichDataType which(type); - - if (which.isTuple() && key_node_function_name == "tuple") - { - const auto & tuple_column = typeid_cast(column.get()); - const auto & tuple_data_type = typeid_cast(type.get()); - - if (tuple_data_type->getElements().size() != 
key_node_function_arguments_size || tuple_column->getColumns().size() != key_node_function_arguments_size) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal types of arguments of function {}", function_name); - - bool match_with_subtype = false; - const auto & sub_columns = tuple_column->getColumns(); - const auto & sub_data_types = tuple_data_type->getElements(); - - for (size_t index = 0; index < key_node_function_arguments_size; ++index) - match_with_subtype |= traverseTreeIn(function_name, key_node_function.getArgumentAt(index), nullptr, sub_data_types[index], sub_columns[index], out); - - return match_with_subtype; - } - - if (key_node_function_name == "arrayElement") - { - /** Try to parse arrayElement for mapKeys index. - * It is important to ignore keys like column_map['Key'] IN ('') because if key does not exists in map - * we return default value for arrayElement. - * - * We cannot skip keys that does not exist in map if comparison is with default type value because - * that way we skip necessary granules where map key does not exists. - */ - if (!prepared_set) - return false; - - auto default_column_to_check = type->createColumnConstWithDefaultValue(1)->convertToFullColumnIfConst(); - ColumnWithTypeAndName default_column_with_type_to_check { default_column_to_check, type, "" }; - ColumnsWithTypeAndName default_columns_with_type_to_check = {default_column_with_type_to_check}; - auto set_contains_default_value_predicate_column = prepared_set->execute(default_columns_with_type_to_check, false /*negative*/); - const auto & set_contains_default_value_predicate_column_typed = assert_cast(*set_contains_default_value_predicate_column); - bool set_contain_default_value = set_contains_default_value_predicate_column_typed.getData()[0]; - if (set_contain_default_value) - return false; - - auto first_argument = key_node_function.getArgumentAt(0); - const auto column_name = first_argument.getColumnName(); - auto map_keys_index_column_name = fmt::format("mapKeys({})", column_name); - auto map_values_index_column_name = fmt::format("mapValues({})", column_name); - - if (header.has(map_keys_index_column_name)) - { - /// For mapKeys we serialize key argument with bloom filter - - auto second_argument = key_node_function.getArgumentAt(1); - - Field constant_value; - DataTypePtr constant_type; - - if (second_argument.tryGetConstant(constant_value, constant_type)) - { - size_t position = header.getPositionByName(map_keys_index_column_name); - const DataTypePtr & index_type = header.getByPosition(position).type; - const DataTypePtr actual_type = BloomFilter::getPrimitiveType(index_type); - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), constant_value))); - } - else - { - return false; - } - } - else if (header.has(map_values_index_column_name)) - { - /// For mapValues we serialize set with bloom filter - - size_t row_size = column->size(); - size_t position = header.getPositionByName(map_values_index_column_name); - const DataTypePtr & index_type = header.getByPosition(position).type; - const auto & array_type = assert_cast(*index_type); - const auto & array_nested_type = array_type.getNestedType(); - const auto & converted_column = castColumn(ColumnWithTypeAndName{column, type, ""}, array_nested_type); - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(array_nested_type, converted_column, 0, row_size))); - } - else - { - return false; - } - - if (function_name == "in" || function_name == 
"globalIn") - out.function = RPNElement::FUNCTION_IN; - - if (function_name == "notIn" || function_name == "globalNotIn") - out.function = RPNElement::FUNCTION_NOT_IN; - - return true; - } - } - - return false; -} - - -static bool indexOfCanUseBloomFilter(const RPNBuilderTreeNode * parent) -{ - if (!parent) - return true; - - if (!parent->isFunction()) - return false; - - auto function = parent->toFunctionNode(); - auto function_name = function.getFunctionName(); - - /// `parent` is a function where `indexOf` is located. - /// Example: `indexOf(arr, x) = 1`, parent is a function named `equals`. - if (function_name == "and") - { - return true; - } - else if (function_name == "equals" /// notEquals is not applicable - || function_name == "greater" || function_name == "greaterOrEquals" - || function_name == "less" || function_name == "lessOrEquals") - { - size_t function_arguments_size = function.getArgumentsSize(); - if (function_arguments_size != 2) - return false; - - /// We don't allow constant expressions like `indexOf(arr, x) = 1 + 0` but it's negligible. - - /// We should return true when the corresponding expression implies that the array contains the element. - /// Example: when `indexOf(arr, x)` > 10 is written, it means that arr definitely should contain the element - /// (at least at 11th position but it does not matter). - - bool reversed = false; - Field constant_value; - DataTypePtr constant_type; - - if (function.getArgumentAt(0).tryGetConstant(constant_value, constant_type)) - { - reversed = true; - } - else if (function.getArgumentAt(1).tryGetConstant(constant_value, constant_type)) - { - } - else - { - return false; - } - - Field zero(0); - bool constant_equal_zero = applyVisitor(FieldVisitorAccurateEquals(), constant_value, zero); - - if (function_name == "equals" && !constant_equal_zero) - { - /// indexOf(...) = c, c != 0 - return true; - } - else if (function_name == "notEquals" && constant_equal_zero) - { - /// indexOf(...) != c, c = 0 - return true; - } - else if (function_name == (reversed ? "less" : "greater") && !applyVisitor(FieldVisitorAccurateLess(), constant_value, zero)) - { - /// indexOf(...) > c, c >= 0 - return true; - } - else if (function_name == (reversed ? "lessOrEquals" : "greaterOrEquals") && applyVisitor(FieldVisitorAccurateLess(), zero, constant_value)) - { - /// indexOf(...) >= c, c > 0 - return true; - } - - return false; - } - - return false; -} - - -bool MergeTreeIndexConditionBloomFilter::traverseTreeEquals( - const String & function_name, - const RPNBuilderTreeNode & key_node, - const DataTypePtr & value_type, - const Field & value_field, - RPNElement & out, - const RPNBuilderTreeNode * parent) -{ - auto key_column_name = key_node.getColumnName(); - - if (header.has(key_column_name)) - { - size_t position = header.getPositionByName(key_column_name); - const DataTypePtr & index_type = header.getByPosition(position).type; - const auto * array_type = typeid_cast(index_type.get()); - - if (function_name == "has" || function_name == "indexOf") - { - if (!array_type) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", function_name); - - /// We can treat `indexOf` function similar to `has`. - /// But it is little more cumbersome, compare: `has(arr, elem)` and `indexOf(arr, elem) != 0`. - /// The `parent` in this context is expected to be function `!=` (`notEquals`). 
- if (function_name == "has" || indexOfCanUseBloomFilter(parent)) - { - out.function = RPNElement::FUNCTION_HAS; - const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType()); - auto converted_field = convertFieldToType(value_field, *actual_type, value_type.get()); - if (converted_field.isNull()) - return false; - - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field))); - } - } - else if (function_name == "hasAny" || function_name == "hasAll") - { - if (!array_type) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", function_name); - - if (value_field.getType() != Field::Types::Array) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument for function {} must be an array.", function_name); - - const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType()); - ColumnPtr column; - { - const bool is_nullable = actual_type->isNullable(); - auto mutable_column = actual_type->createColumn(); - - for (const auto & f : value_field.get()) - { - if ((f.isNull() && !is_nullable) || f.isDecimal(f.getType())) /// NOLINT(readability-static-accessed-through-instance) - return false; - - auto converted = convertFieldToType(f, *actual_type); - if (converted.isNull()) - return false; - - mutable_column->insert(converted); - } - - column = std::move(mutable_column); - } - - out.function = function_name == "hasAny" ? - RPNElement::FUNCTION_HAS_ANY : - RPNElement::FUNCTION_HAS_ALL; - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(actual_type, column, 0, column->size()))); - } - else - { - if (array_type) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "An array type of bloom_filter supports only has(), indexOf(), and hasAny() functions."); - - out.function = function_name == "equals" ? 
RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS; - const DataTypePtr actual_type = BloomFilter::getPrimitiveType(index_type); - auto converted_field = convertFieldToType(value_field, *actual_type, value_type.get()); - if (converted_field.isNull()) - return false; - - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field))); - } - - return true; - } - - if (function_name == "mapContains" || function_name == "has") - { - auto map_keys_index_column_name = fmt::format("mapKeys({})", key_column_name); - if (!header.has(map_keys_index_column_name)) - return false; - - size_t position = header.getPositionByName(map_keys_index_column_name); - const DataTypePtr & index_type = header.getByPosition(position).type; - const auto * array_type = typeid_cast(index_type.get()); - - if (!array_type) - return false; - - out.function = RPNElement::FUNCTION_HAS; - const DataTypePtr actual_type = BloomFilter::getPrimitiveType(array_type->getNestedType()); - auto converted_field = convertFieldToType(value_field, *actual_type, value_type.get()); - if (converted_field.isNull()) - return false; - - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field))); - return true; - } - - if (key_node.isFunction()) - { - WhichDataType which(value_type); - - auto key_node_function = key_node.toFunctionNode(); - auto key_node_function_name = key_node_function.getFunctionName(); - size_t key_node_function_arguments_size = key_node_function.getArgumentsSize(); - - if (which.isTuple() && key_node_function_name == "tuple") - { - const Tuple & tuple = value_field.get(); - const auto * value_tuple_data_type = typeid_cast(value_type.get()); - - if (tuple.size() != key_node_function_arguments_size) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal types of arguments of function {}", function_name); - - bool match_with_subtype = false; - const DataTypes & subtypes = value_tuple_data_type->getElements(); - - for (size_t index = 0; index < tuple.size(); ++index) - match_with_subtype |= traverseTreeEquals(function_name, key_node_function.getArgumentAt(index), subtypes[index], tuple[index], out, &key_node); - - return match_with_subtype; - } - - if (key_node_function_name == "arrayElement" && (function_name == "equals" || function_name == "notEquals")) - { - /** Try to parse arrayElement for mapKeys index. - * It is important to ignore keys like column_map['Key'] = '' because if key does not exists in map - * we return default value for arrayElement. - * - * We cannot skip keys that does not exist in map if comparison is with default type value because - * that way we skip necessary granules where map key does not exists. 
- */ - if (value_field == value_type->getDefault()) - return false; - - auto first_argument = key_node_function.getArgumentAt(0); - const auto column_name = first_argument.getColumnName(); - - auto map_keys_index_column_name = fmt::format("mapKeys({})", column_name); - auto map_values_index_column_name = fmt::format("mapValues({})", column_name); - - size_t position = 0; - Field const_value = value_field; - DataTypePtr const_type; - - if (header.has(map_keys_index_column_name)) - { - position = header.getPositionByName(map_keys_index_column_name); - auto second_argument = key_node_function.getArgumentAt(1); - - if (!second_argument.tryGetConstant(const_value, const_type)) - return false; - } - else if (header.has(map_values_index_column_name)) - { - position = header.getPositionByName(map_values_index_column_name); - } - else - { - return false; - } - - out.function = function_name == "equals" ? RPNElement::FUNCTION_EQUALS : RPNElement::FUNCTION_NOT_EQUALS; - - const auto & index_type = header.getByPosition(position).type; - const auto actual_type = BloomFilter::getPrimitiveType(index_type); - out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), const_value))); - - return true; - } - } - - return false; -} - -} diff --git a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h deleted file mode 100644 index 8029d6d405b..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexConditionBloomFilter.h +++ /dev/null @@ -1,87 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -class MergeTreeIndexConditionBloomFilter final : public IMergeTreeIndexCondition, WithContext -{ -public: - struct RPNElement - { - enum Function - { - /// Atoms of a Boolean expression. - FUNCTION_EQUALS, - FUNCTION_NOT_EQUALS, - FUNCTION_HAS, - FUNCTION_HAS_ANY, - FUNCTION_HAS_ALL, - FUNCTION_IN, - FUNCTION_NOT_IN, - FUNCTION_UNKNOWN, /// Can take any value. - /// Operators of the logical expression. 
- FUNCTION_NOT, - FUNCTION_AND, - FUNCTION_OR, - /// Constants - ALWAYS_FALSE, - ALWAYS_TRUE, - }; - - RPNElement(Function function_ = FUNCTION_UNKNOWN) : function(function_) {} /// NOLINT - - Function function = FUNCTION_UNKNOWN; - std::vector> predicate; - }; - - MergeTreeIndexConditionBloomFilter(const ActionsDAGPtr & filter_actions_dag, ContextPtr context_, const Block & header_, size_t hash_functions_); - - bool alwaysUnknownOrTrue() const override; - - bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override - { - if (const auto & bf_granule = typeid_cast(granule.get())) - return mayBeTrueOnGranule(bf_granule); - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Requires bloom filter index granule."); - } - -private: - const Block & header; - const size_t hash_functions; - std::vector rpn; - - bool mayBeTrueOnGranule(const MergeTreeIndexGranuleBloomFilter * granule) const; - - bool extractAtomFromTree(const RPNBuilderTreeNode & node, RPNElement & out); - - bool traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent); - - bool traverseTreeIn( - const String & function_name, - const RPNBuilderTreeNode & key_node, - const ConstSetPtr & prepared_set, - const DataTypePtr & type, - const ColumnPtr & column, - RPNElement & out); - - bool traverseTreeEquals( - const String & function_name, - const RPNBuilderTreeNode & key_node, - const DataTypePtr & value_type, - const Field & value_field, - RPNElement & out, - const RPNBuilderTreeNode * parent); -}; - -} diff --git a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp deleted file mode 100644 index 8355cac8033..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_) - : bits_per_row(bits_per_row_), hash_functions(hash_functions_), bloom_filters(index_columns_) -{ - total_rows = 0; - for (size_t column = 0; column < index_columns_; ++column) - bloom_filters[column] = std::make_shared(bits_per_row, hash_functions, 0); -} - -MergeTreeIndexGranuleBloomFilter::MergeTreeIndexGranuleBloomFilter( - size_t bits_per_row_, size_t hash_functions_, const std::vector>& column_hashes_) - : bits_per_row(bits_per_row_), hash_functions(hash_functions_), bloom_filters(column_hashes_.size()) -{ - if (column_hashes_.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule_index_blocks empty or total_rows is zero."); - - size_t bloom_filter_max_size = 0; - for (const auto & column_hash : column_hashes_) - bloom_filter_max_size = std::max(bloom_filter_max_size, column_hash.size()); - - static size_t atom_size = 8; - - // If multiple columns are given, we will initialize all the bloom filters - // with the size of the highest-cardinality one. 
This is done for compatibility with - // existing binary serialization format - total_rows = bloom_filter_max_size; - size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; - - for (size_t column = 0, columns = column_hashes_.size(); column < columns; ++column) - { - bloom_filters[column] = std::make_shared(bytes_size, hash_functions, 0); - fillingBloomFilter(bloom_filters[column], column_hashes_[column]); - } -} - -bool MergeTreeIndexGranuleBloomFilter::empty() const -{ - return !total_rows; -} - -void MergeTreeIndexGranuleBloomFilter::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) -{ - if (version != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version); - - readVarUInt(total_rows, istr); - - static size_t atom_size = 8; - size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; - size_t read_size = bytes_size; - for (auto & filter : bloom_filters) - { - filter->resize(bytes_size); -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - read_size = filter->getFilter().size() * sizeof(BloomFilter::UnderType); -#endif - istr.readStrict(reinterpret_cast(filter->getFilter().data()), read_size); - } -} - -void MergeTreeIndexGranuleBloomFilter::serializeBinary(WriteBuffer & ostr) const -{ - if (empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty bloom filter index."); - - writeVarUInt(total_rows, ostr); - - static size_t atom_size = 8; - size_t write_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; - for (const auto & bloom_filter : bloom_filters) - { -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - write_size = bloom_filter->getFilter().size() * sizeof(BloomFilter::UnderType); -#endif - ostr.write(reinterpret_cast(bloom_filter->getFilter().data()), write_size); - } -} - -void MergeTreeIndexGranuleBloomFilter::fillingBloomFilter(BloomFilterPtr & bf, const HashSet &hashes) const -{ - for (const auto & bf_base_hash : hashes) - for (size_t i = 0; i < hash_functions; ++i) - bf->addHashWithSeed(bf_base_hash.getKey(), BloomFilterHash::bf_hash_seed[i]); -} - -} diff --git a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h b/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h deleted file mode 100644 index a3434daa5a4..00000000000 --- a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ - -class MergeTreeIndexGranuleBloomFilter final : public IMergeTreeIndexGranule -{ -public: - MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, size_t index_columns_); - - MergeTreeIndexGranuleBloomFilter(size_t bits_per_row_, size_t hash_functions_, const std::vector> & column_hashes); - - bool empty() const override; - - void serializeBinary(WriteBuffer & ostr) const override; - void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; - - const std::vector & getFilters() const { return bloom_filters; } - -private: - const size_t bits_per_row; - const size_t hash_functions; - - size_t total_rows = 0; - std::vector bloom_filters; - - void fillingBloomFilter(BloomFilterPtr & bf, const HashSet & hashes) const; -}; - - -} From 0ae422d33c0538f759c64444cc8b7a6f8f2a5b63 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 23 Apr 2024 14:11:12 +0000 Subject: [PATCH 043/192] Rename MergeTreeIndexFullText --> MergeTreeIndexBloomFilterText --- src/Interpreters/GinFilter.cpp | 2 +- .../MergeTree/MergeTreeIndexBloomFilter.cpp | 4 +- 
....cpp => MergeTreeIndexBloomFilterText.cpp} | 56 +++++++++---------- ...Text.h => MergeTreeIndexBloomFilterText.h} | 28 +++++----- .../MergeTree/MergeTreeIndexInverted.h | 1 - src/Storages/MergeTree/MergeTreeIndices.cpp | 12 ++-- src/Storages/MergeTree/MergeTreeIndices.h | 6 +- .../tests/gtest_SplitTokenExtractor.cpp | 2 +- 8 files changed, 55 insertions(+), 56 deletions(-) rename src/Storages/MergeTree/{MergeTreeIndexFullText.cpp => MergeTreeIndexBloomFilterText.cpp} (92%) rename src/Storages/MergeTree/{MergeTreeIndexFullText.h => MergeTreeIndexBloomFilterText.h} (84%) diff --git a/src/Interpreters/GinFilter.cpp b/src/Interpreters/GinFilter.cpp index 5d823318313..1ce26ed1fd1 100644 --- a/src/Interpreters/GinFilter.cpp +++ b/src/Interpreters/GinFilter.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp index ed091022a91..4f25a014382 100644 --- a/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilter.cpp @@ -921,7 +921,7 @@ static void assertIndexColumnsType(const Block & header) } } -MergeTreeIndexPtr bloomFilterIndexCreatorNew( +MergeTreeIndexPtr bloomFilterIndexCreator( const IndexDescription & index) { double max_conflict_probability = 0.025; @@ -938,7 +938,7 @@ MergeTreeIndexPtr bloomFilterIndexCreatorNew( index, bits_per_row_and_size_of_hash_functions.first, bits_per_row_and_size_of_hash_functions.second); } -void bloomFilterIndexValidatorNew(const IndexDescription & index, bool attach) +void bloomFilterIndexValidator(const IndexDescription & index, bool attach) { assertIndexColumnsType(index.sample_block); diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp similarity index 92% rename from src/Storages/MergeTree/MergeTreeIndexFullText.cpp rename to src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp index 4cd616513ac..826b149cf01 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -32,7 +32,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText( +MergeTreeIndexGranuleBloomFilterText::MergeTreeIndexGranuleBloomFilterText( const String & index_name_, size_t columns_number, const BloomFilterParameters & params_) @@ -44,7 +44,7 @@ MergeTreeIndexGranuleFullText::MergeTreeIndexGranuleFullText( { } -void MergeTreeIndexGranuleFullText::serializeBinary(WriteBuffer & ostr) const +void MergeTreeIndexGranuleBloomFilterText::serializeBinary(WriteBuffer & ostr) const { if (empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty fulltext index {}.", backQuote(index_name)); @@ -53,7 +53,7 @@ void MergeTreeIndexGranuleFullText::serializeBinary(WriteBuffer & ostr) const ostr.write(reinterpret_cast(bloom_filter.getFilter().data()), params.filter_size); } -void MergeTreeIndexGranuleFullText::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) +void MergeTreeIndexGranuleBloomFilterText::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) { if (version != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version); @@ -66,7 +66,7 @@ void MergeTreeIndexGranuleFullText::deserializeBinary(ReadBuffer & istr, MergeTr } 
-MergeTreeIndexAggregatorFullText::MergeTreeIndexAggregatorFullText( +MergeTreeIndexAggregatorBloomFilterText::MergeTreeIndexAggregatorBloomFilterText( const Names & index_columns_, const String & index_name_, const BloomFilterParameters & params_, @@ -76,20 +76,20 @@ MergeTreeIndexAggregatorFullText::MergeTreeIndexAggregatorFullText( , params(params_) , token_extractor(token_extractor_) , granule( - std::make_shared( + std::make_shared( index_name, index_columns.size(), params)) { } -MergeTreeIndexGranulePtr MergeTreeIndexAggregatorFullText::getGranuleAndReset() +MergeTreeIndexGranulePtr MergeTreeIndexAggregatorBloomFilterText::getGranuleAndReset() { - auto new_granule = std::make_shared( + auto new_granule = std::make_shared( index_name, index_columns.size(), params); new_granule.swap(granule); return new_granule; } -void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, size_t limit) +void MergeTreeIndexAggregatorBloomFilterText::update(const Block & block, size_t * pos, size_t limit) { if (*pos >= block.rows()) throw Exception(ErrorCodes::LOGICAL_ERROR, "The provided position is not less than the number of block rows. " @@ -137,7 +137,7 @@ void MergeTreeIndexAggregatorFullText::update(const Block & block, size_t * pos, *pos += rows_read; } -MergeTreeConditionFullText::MergeTreeConditionFullText( +MergeTreeConditionBloomFilterText::MergeTreeConditionBloomFilterText( const ActionsDAGPtr & filter_actions_dag, ContextPtr context, const Block & index_sample_block, @@ -162,7 +162,7 @@ MergeTreeConditionFullText::MergeTreeConditionFullText( } /// Keep in-sync with MergeTreeConditionGinFilter::alwaysUnknownOrTrue -bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const +bool MergeTreeConditionBloomFilterText::alwaysUnknownOrTrue() const { /// Check like in KeyCondition. std::vector rpn_stack; @@ -212,10 +212,10 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const } /// Keep in-sync with MergeTreeIndexConditionGin::mayBeTrueOnTranuleInPart -bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const +bool MergeTreeConditionBloomFilterText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const { - std::shared_ptr granule - = std::dynamic_pointer_cast(idx_granule); + std::shared_ptr granule + = std::dynamic_pointer_cast(idx_granule); if (!granule) throw Exception(ErrorCodes::LOGICAL_ERROR, "BloomFilter index condition got a granule with the wrong type."); @@ -323,13 +323,13 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx return rpn_stack[0].can_be_true; } -std::optional MergeTreeConditionFullText::getKeyIndex(const std::string & key_column_name) +std::optional MergeTreeConditionBloomFilterText::getKeyIndex(const std::string & key_column_name) { const auto it = std::ranges::find(index_columns, key_column_name); return it == index_columns.end() ? 
std::nullopt : std::make_optional(std::ranges::distance(index_columns.cbegin(), it)); } -bool MergeTreeConditionFullText::extractAtomFromTree(const RPNBuilderTreeNode & node, RPNElement & out) +bool MergeTreeConditionBloomFilterText::extractAtomFromTree(const RPNBuilderTreeNode & node, RPNElement & out) { { Field const_value; @@ -419,7 +419,7 @@ bool MergeTreeConditionFullText::extractAtomFromTree(const RPNBuilderTreeNode & return false; } -bool MergeTreeConditionFullText::traverseTreeEquals( +bool MergeTreeConditionBloomFilterText::traverseTreeEquals( const String & function_name, const RPNBuilderTreeNode & key_node, const DataTypePtr & value_type, @@ -638,7 +638,7 @@ bool MergeTreeConditionFullText::traverseTreeEquals( return false; } -bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( +bool MergeTreeConditionBloomFilterText::tryPrepareSetBloomFilter( const RPNBuilderTreeNode & left_argument, const RPNBuilderTreeNode & right_argument, RPNElement & out) @@ -714,23 +714,23 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter( return true; } -MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const +MergeTreeIndexGranulePtr MergeTreeIndexBloomFilterText::createIndexGranule() const { - return std::make_shared(index.name, index.column_names.size(), params); + return std::make_shared(index.name, index.column_names.size(), params); } -MergeTreeIndexAggregatorPtr MergeTreeIndexFullText::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const +MergeTreeIndexAggregatorPtr MergeTreeIndexBloomFilterText::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const { - return std::make_shared(index.column_names, index.name, params, token_extractor.get()); + return std::make_shared(index.column_names, index.name, params, token_extractor.get()); } -MergeTreeIndexConditionPtr MergeTreeIndexFullText::createIndexCondition( +MergeTreeIndexConditionPtr MergeTreeIndexBloomFilterText::createIndexCondition( const ActionsDAGPtr & filter_dag, ContextPtr context) const { - return std::make_shared(filter_dag, context, index.sample_block, params, token_extractor.get()); + return std::make_shared(filter_dag, context, index.sample_block, params, token_extractor.get()); } -MergeTreeIndexPtr bloomFilterIndexCreator( +MergeTreeIndexPtr bloomFilterIndexTextCreator( const IndexDescription & index) { if (index.type == NgramTokenExtractor::getName()) @@ -743,7 +743,7 @@ MergeTreeIndexPtr bloomFilterIndexCreator( auto tokenizer = std::make_unique(n); - return std::make_shared(index, params, std::move(tokenizer)); + return std::make_shared(index, params, std::move(tokenizer)); } else if (index.type == SplitTokenExtractor::getName()) { @@ -754,7 +754,7 @@ MergeTreeIndexPtr bloomFilterIndexCreator( auto tokenizer = std::make_unique(); - return std::make_shared(index, params, std::move(tokenizer)); + return std::make_shared(index, params, std::move(tokenizer)); } else { @@ -762,7 +762,7 @@ MergeTreeIndexPtr bloomFilterIndexCreator( } } -void bloomFilterIndexValidator(const IndexDescription & index, bool /*attach*/) +void bloomFilterIndexTextValidator(const IndexDescription & index, bool /*attach*/) { for (const auto & index_data_type : index.data_types) { diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.h b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.h similarity index 84% rename from src/Storages/MergeTree/MergeTreeIndexFullText.h rename to src/Storages/MergeTree/MergeTreeIndexBloomFilterText.h index e66f498ce1d..6fd969030df 100644 --- 
a/src/Storages/MergeTree/MergeTreeIndexFullText.h +++ b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.h @@ -11,14 +11,14 @@ namespace DB { -struct MergeTreeIndexGranuleFullText final : public IMergeTreeIndexGranule +struct MergeTreeIndexGranuleBloomFilterText final : public IMergeTreeIndexGranule { - explicit MergeTreeIndexGranuleFullText( + explicit MergeTreeIndexGranuleBloomFilterText( const String & index_name_, size_t columns_number, const BloomFilterParameters & params_); - ~MergeTreeIndexGranuleFullText() override = default; + ~MergeTreeIndexGranuleBloomFilterText() override = default; void serializeBinary(WriteBuffer & ostr) const override; void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; @@ -32,17 +32,17 @@ struct MergeTreeIndexGranuleFullText final : public IMergeTreeIndexGranule bool has_elems; }; -using MergeTreeIndexGranuleFullTextPtr = std::shared_ptr; +using MergeTreeIndexGranuleBloomFilterTextPtr = std::shared_ptr; -struct MergeTreeIndexAggregatorFullText final : IMergeTreeIndexAggregator +struct MergeTreeIndexAggregatorBloomFilterText final : IMergeTreeIndexAggregator { - explicit MergeTreeIndexAggregatorFullText( + explicit MergeTreeIndexAggregatorBloomFilterText( const Names & index_columns_, const String & index_name_, const BloomFilterParameters & params_, TokenExtractorPtr token_extractor_); - ~MergeTreeIndexAggregatorFullText() override = default; + ~MergeTreeIndexAggregatorBloomFilterText() override = default; bool empty() const override { return !granule || granule->empty(); } MergeTreeIndexGranulePtr getGranuleAndReset() override; @@ -54,21 +54,21 @@ struct MergeTreeIndexAggregatorFullText final : IMergeTreeIndexAggregator BloomFilterParameters params; TokenExtractorPtr token_extractor; - MergeTreeIndexGranuleFullTextPtr granule; + MergeTreeIndexGranuleBloomFilterTextPtr granule; }; -class MergeTreeConditionFullText final : public IMergeTreeIndexCondition +class MergeTreeConditionBloomFilterText final : public IMergeTreeIndexCondition { public: - MergeTreeConditionFullText( + MergeTreeConditionBloomFilterText( const ActionsDAGPtr & filter_actions_dag, ContextPtr context, const Block & index_sample_block, const BloomFilterParameters & params_, TokenExtractorPtr token_extactor_); - ~MergeTreeConditionFullText() override = default; + ~MergeTreeConditionBloomFilterText() override = default; bool alwaysUnknownOrTrue() const override; bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx_granule) const override; @@ -146,10 +146,10 @@ private: RPN rpn; }; -class MergeTreeIndexFullText final : public IMergeTreeIndex +class MergeTreeIndexBloomFilterText final : public IMergeTreeIndex { public: - MergeTreeIndexFullText( + MergeTreeIndexBloomFilterText( const IndexDescription & index_, const BloomFilterParameters & params_, std::unique_ptr && token_extractor_) @@ -157,7 +157,7 @@ public: , params(params_) , token_extractor(std::move(token_extractor_)) {} - ~MergeTreeIndexFullText() override = default; + ~MergeTreeIndexBloomFilterText() override = default; MergeTreeIndexGranulePtr createIndexGranule() const override; MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override; diff --git a/src/Storages/MergeTree/MergeTreeIndexInverted.h b/src/Storages/MergeTree/MergeTreeIndexInverted.h index f3c1f37e364..bab4e122aa6 100644 --- a/src/Storages/MergeTree/MergeTreeIndexInverted.h +++ b/src/Storages/MergeTree/MergeTreeIndexInverted.h @@ -5,7 +5,6 @@ #include #include #include -#include #include 
 namespace DB
diff --git a/src/Storages/MergeTree/MergeTreeIndices.cpp b/src/Storages/MergeTree/MergeTreeIndices.cpp
index 322cdd35afe..be8b4c795f0 100644
--- a/src/Storages/MergeTree/MergeTreeIndices.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndices.cpp
@@ -115,14 +115,14 @@ MergeTreeIndexFactory::MergeTreeIndexFactory()
     registerCreator("set", setIndexCreator);
     registerValidator("set", setIndexValidator);

-    registerCreator("ngrambf_v1", bloomFilterIndexCreator);
-    registerValidator("ngrambf_v1", bloomFilterIndexValidator);
+    registerCreator("ngrambf_v1", bloomFilterIndexTextCreator);
+    registerValidator("ngrambf_v1", bloomFilterIndexTextValidator);

-    registerCreator("tokenbf_v1", bloomFilterIndexCreator);
-    registerValidator("tokenbf_v1", bloomFilterIndexValidator);
+    registerCreator("tokenbf_v1", bloomFilterIndexTextCreator);
+    registerValidator("tokenbf_v1", bloomFilterIndexTextValidator);

-    registerCreator("bloom_filter", bloomFilterIndexCreatorNew);
-    registerValidator("bloom_filter", bloomFilterIndexValidatorNew);
+    registerCreator("bloom_filter", bloomFilterIndexCreator);
+    registerValidator("bloom_filter", bloomFilterIndexValidator);

     registerCreator("hypothesis", hypothesisIndexCreator);
     registerValidator("hypothesis", hypothesisIndexValidator);
diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h
index 8fdadb4e5eb..900e6b6658c 100644
--- a/src/Storages/MergeTree/MergeTreeIndices.h
+++ b/src/Storages/MergeTree/MergeTreeIndices.h
@@ -221,12 +221,12 @@ void minmaxIndexValidator(const IndexDescription & index, bool attach);
 MergeTreeIndexPtr setIndexCreator(const IndexDescription & index);
 void setIndexValidator(const IndexDescription & index, bool attach);

+MergeTreeIndexPtr bloomFilterIndexTextCreator(const IndexDescription & index);
+void bloomFilterIndexTextValidator(const IndexDescription & index, bool attach);
+
 MergeTreeIndexPtr bloomFilterIndexCreator(const IndexDescription & index);
 void bloomFilterIndexValidator(const IndexDescription & index, bool attach);

-MergeTreeIndexPtr bloomFilterIndexCreatorNew(const IndexDescription & index);
-void bloomFilterIndexValidatorNew(const IndexDescription & index, bool attach);
-
 MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index);
 void hypothesisIndexValidator(const IndexDescription & index, bool attach);
diff --git a/src/Storages/tests/gtest_SplitTokenExtractor.cpp b/src/Storages/tests/gtest_SplitTokenExtractor.cpp
index 62389639c11..e01673359bd 100644
--- a/src/Storages/tests/gtest_SplitTokenExtractor.cpp
+++ b/src/Storages/tests/gtest_SplitTokenExtractor.cpp
@@ -1,4 +1,4 @@
-#include
+#include

 #include
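The registrations above follow a plain name-to-factory registry pattern: each index type name maps to a creator and a validator, and one factory function can back several names (as with `ngrambf_v1` and `tokenbf_v1`). A minimal sketch using simplified stand-in types, not ClickHouse's actual interfaces:

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

// Hypothetical stand-ins for the real index interfaces.
struct IIndex { virtual ~IIndex() = default; };
struct IndexDescription { std::string type; };

class IndexFactory
{
public:
    using Creator = std::function<std::shared_ptr<IIndex>(const IndexDescription &)>;
    using Validator = std::function<void(const IndexDescription &, bool attach)>;

    void registerCreator(const std::string & type, Creator f) { creators[type] = std::move(f); }
    void registerValidator(const std::string & type, Validator f) { validators[type] = std::move(f); }

    std::shared_ptr<IIndex> create(const IndexDescription & desc) const
    {
        auto it = creators.find(desc.type);
        if (it == creators.end())
            throw std::runtime_error("unknown index type: " + desc.type);
        return it->second(desc);
    }

    void validate(const IndexDescription & desc, bool attach) const
    {
        if (auto it = validators.find(desc.type); it != validators.end())
            it->second(desc, attach);
    }

private:
    std::map<std::string, Creator> creators;
    std::map<std::string, Validator> validators;
};

struct TextBloomFilterIndex : IIndex {};

int main()
{
    IndexFactory factory;
    // One creator can back several index type names, as with ngrambf_v1/tokenbf_v1.
    auto text_creator = [](const IndexDescription &) { return std::make_shared<TextBloomFilterIndex>(); };
    factory.registerCreator("ngrambf_v1", text_creator);
    factory.registerCreator("tokenbf_v1", text_creator);

    factory.validate({"tokenbf_v1"}, /*attach=*/false);
    std::cout << (factory.create({"tokenbf_v1"}) != nullptr) << '\n'; // 1
}
```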
From 7c24d4f48de6b62faf76445e729eea536dbec010 Mon Sep 17 00:00:00 2001
From: Alexey Petrunyaka <167422282+pet74alex@users.noreply.github.com>
Date: Wed, 24 Apr 2024 03:33:53 +0300
Subject: [PATCH 044/192] Update Russian version of uuid-functions.md

---
 .../sql-reference/functions/uuid-functions.md | 238 ++++++++++++++++++
 1 file changed, 238 insertions(+)

diff --git a/docs/ru/sql-reference/functions/uuid-functions.md b/docs/ru/sql-reference/functions/uuid-functions.md
index 65d13079ee8..8f41d2ab6f4 100644
--- a/docs/ru/sql-reference/functions/uuid-functions.md
+++ b/docs/ru/sql-reference/functions/uuid-functions.md
@@ -51,6 +51,166 @@ SELECT generateUUIDv4(1), generateUUIDv4(2)
 └──────────────────────────────────────┴──────────────────────────────────────┘
 ```
 
+## generateUUIDv7 {#uuidv7-function-generate}
+
+Generates a [UUID version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04) identifier. The generated UUID consists of a 48-bit timestamp (Unix time in milliseconds), version 7 and variant 2 markers, and random data, in the following order:
+```
+ 0                   1                   2                   3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                           unix_ts_ms                          |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|          unix_ts_ms           |  ver  |        rand_a         |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|var|                        rand_b                             |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                            rand_b                             |
+└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
+```
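To make the layout above concrete, here is a toy sketch of packing the three fields into two 64-bit halves. It is an illustration only, not the ClickHouse implementation:

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <random>

// Illustrative packing of the layout above into two 64-bit halves:
// high: 48-bit ms timestamp | 4-bit version (7) | 12-bit rand_a
// low:  2-bit variant (0b10) | 62-bit rand_b
struct UUIDv7 { uint64_t high = 0; uint64_t low = 0; };

UUIDv7 makeUUIDv7()
{
    using namespace std::chrono;
    const uint64_t unix_ts_ms = static_cast<uint64_t>(
        duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count());

    static thread_local std::mt19937_64 rng{std::random_device{}()};
    const uint64_t rand_a = rng() & 0xFFF;                 // 12 random bits
    const uint64_t rand_b = rng() & 0x3FFFFFFFFFFFFFFFULL; // 62 random bits

    UUIDv7 uuid;
    uuid.high = (unix_ts_ms << 16) | (0x7ULL << 12) | rand_a;
    uuid.low = (0x2ULL << 62) | rand_b;
    return uuid;
}

int main()
{
    const UUIDv7 u = makeUUIDv7();
    std::printf("%016llx%016llx\n",
                static_cast<unsigned long long>(u.high),
                static_cast<unsigned long long>(u.low));
}
```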
+
+**Syntax**
+
+``` sql
+generateUUIDv7([x])
+```
+
+**Arguments**
+
+- `x` — an [expression](../syntax.md#syntax-expressions) returning a value of one of the [supported data types](../data-types/index.md#data_types). The value is used to avoid [common subexpression elimination](index.md#common-subexpression-elimination) if the function is called several times in one query. Optional parameter.
+
+**Returned value**
+
+A value of type [UUID](../../sql-reference/functions/uuid-functions.md).
+
+**Usage example**
+
+This example demonstrates how to create a table with a UUID column and insert a generated UUIDv7 into it.
+
+``` sql
+CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
+
+INSERT INTO t_uuid SELECT generateUUIDv7()
+
+SELECT * FROM t_uuid
+```
+
+``` text
+┌────────────────────────────────────x─┐
+│ 018f05af-f4a8-778f-beee-1bedbc95c93b │
+└──────────────────────────────────────┘
+```
+
+**Usage example for generating multiple values in one row**
+
+```sql
+SELECT generateUUIDv7(1), generateUUIDv7(7)
+┌─generateUUIDv7(1)────────────────────┬─generateUUIDv7(2)────────────────────┐
+│ 018f05b1-8c2e-7567-a988-48d09606ae8c │ 018f05b1-8c2e-7946-895b-fcd7635da9a0 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
+## generateUUIDv7WithCounter {#uuidv7withcounter-function-generate}
+
+Generates a [UUID version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04) identifier. The generated UUID consists of a 48-bit timestamp (Unix time in milliseconds), version 7 and variant 2 markers, a counter that increases monotonically within a given timestamp, and random data, in the order shown below. For each new timestamp the counter starts from a fresh random value, and for subsequent UUIDv7s with the same timestamp it is incremented by one. If the counter overflows, the timestamp is forcibly advanced by 1 and the counter restarts from a random value. Monotonic growth of the counter within each timestamp is guaranteed across all concurrently running `generateUUIDv7WithCounter` functions.
+```
+ 0                   1                   2                   3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                           unix_ts_ms                          |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|          unix_ts_ms           |  ver  |   counter_high_bits   |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|var|                   counter_low_bits                        |
+├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
+|                            rand_b                             |
+└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
+```
+**Syntax**
+
+``` sql
+generateUUIDv7WithCounter([x])
+```
+
+**Arguments**
+
+- `x` — an [expression](../syntax.md#syntax-expressions) returning a value of one of the [supported data types](../data-types/index.md#data_types). The value is used to avoid [common subexpression elimination](index.md#common-subexpression-elimination) if the function is called several times in one query. Optional parameter.
+
+**Returned value**
+
+A value of type [UUID](../../sql-reference/functions/uuid-functions.md).
+
+**Usage example**
+
+This example demonstrates how to create a table with a UUID column and insert a generated UUIDv7 into it.
+
+``` sql
+CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
+
+INSERT INTO t_uuid SELECT generateUUIDv7WithCounter()
+
+SELECT * FROM t_uuid
+```
+
+``` text
+┌────────────────────────────────────x─┐
+│ 018f05c7-56e3-7ac3-93e9-1d93c4218e0e │
+└──────────────────────────────────────┘
+```
+
+**Usage example for generating multiple values in one row**
+
+```sql
+SELECT generateUUIDv7WithCounter(1), generateUUIDv7WithCounter(7)
+┌─generateUUIDv7WithCounter(1)─────────┬─generateUUIDv7WithCounter(2)─────────┐
+│ 018f05c9-4ab8-7b86-b64e-c9f03fbd45d1 │ 018f05c9-4ab8-7b86-b64e-c9f12efb7e16 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
+## generateUUIDv7WithFastCounter {#uuidv7withfastcounter-function-generate}
+
+Generates a [UUID version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04) identifier. This function is a faster analogue of `generateUUIDv7WithCounter` that trades away the guarantee of counter monotonicity for a given timestamp across different concurrently executed queries. Counter monotonicity is guaranteed only within the single thread that executes this function to generate several UUIDs.
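A rough sketch of the counter behaviour described above, assuming the 42 counter bits shown in the layout (12 high plus 30 low) and one piece of state per thread; this is not the actual implementation:

```cpp
#include <cstdint>
#include <iostream>
#include <random>
#include <utility>

// Per-timestamp counter state (assumption: 42-bit counter, one state per thread).
struct UUIDv7CounterState
{
    static constexpr uint64_t counter_bits = 42;
    static constexpr uint64_t counter_mask = (1ULL << counter_bits) - 1;

    uint64_t last_ts_ms = 0;
    uint64_t counter = 0;
    std::mt19937_64 rng{std::random_device{}()};

    // Returns the {timestamp_ms, counter} pair to embed into the next UUID.
    std::pair<uint64_t, uint64_t> next(uint64_t now_ms)
    {
        if (now_ms <= last_ts_ms)
        {
            // Same (or older) timestamp: increment; on overflow, advance the timestamp.
            counter = (counter + 1) & counter_mask;
            if (counter == 0)
                ++last_ts_ms;
        }
        else
        {
            // Fresh timestamp: restart the counter from a random value,
            // keeping one bit of headroom to postpone overflow.
            last_ts_ms = now_ms;
            counter = rng() & (counter_mask >> 1);
        }
        return {last_ts_ms, counter};
    }
};

int main()
{
    thread_local UUIDv7CounterState state;
    for (int i = 0; i < 3; ++i)
    {
        const auto [ts, counter] = state.next(1700000000000ULL);
        std::cout << ts << ' ' << counter << '\n'; // counter grows monotonically
    }
}
```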
+
+**Syntax**
+
+``` sql
+generateUUIDv7WithFastCounter([x])
+```
+
+**Arguments**
+
+- `x` — an [expression](../syntax.md#syntax-expressions) returning a value of one of the [supported data types](../data-types/index.md#data_types). The value is used to avoid [common subexpression elimination](index.md#common-subexpression-elimination) if the function is called several times in one query. Optional parameter.
+
+**Returned value**
+
+A value of type [UUID](../../sql-reference/functions/uuid-functions.md).
+
+**Usage example**
+
+This example demonstrates how to create a table with a UUID column and insert a generated UUIDv7 into it.
+
+``` sql
+CREATE TABLE t_uuid (x UUID) ENGINE=TinyLog
+
+INSERT INTO t_uuid SELECT generateUUIDv7WithFastCounter()
+
+SELECT * FROM t_uuid
+```
+
+``` text
+┌────────────────────────────────────x─┐
+│ 018f05e2-e3b2-70cb-b8be-64b09b626d32 │
+└──────────────────────────────────────┘
+```
+
+**Usage example for generating multiple values in one row**
+
+```sql
+SELECT generateUUIDv7WithFastCounter(1), generateUUIDv7WithFastCounter(7)
+┌─generateUUIDv7WithFastCounter(1)─────┬─generateUUIDv7WithFastCounter(2)─────┐
+│ 018f05e1-14ee-7bc5-9906-207153b400b1 │ 018f05e1-14ee-7bc5-9906-2072b8e96758 │
+└──────────────────────────────────────┴──────────────────────────────────────┘
+```
+
 ## empty {#empty}
 
 Checks whether the input UUID is empty.
@@ -259,6 +419,84 @@ SELECT
 └──────────────────┴──────────────────────────────────────┘
 ```
 
+## UUIDToNum {#uuidtonum}
+
+Accepts a UUID and returns its bytes as a [FixedString(16)](../../sql-reference/functions/uuid-functions.md). It also accepts an optional second parameter, the UUID representation variant: 1 (the default) means `Big-endian`, 2 means the `Microsoft` format. This function replaces the chain of two separate functions `UUIDStringToNum(toString(uuid))`, so no intermediate conversion from UUID to String is required to extract the bytes of a UUID.
+
+``` sql
+UUIDToNum(UUID[, variant = 1])
+```
+
+**Returned value**
+
+FixedString(16)
+
+**Usage examples**
+
+``` sql
+SELECT
+    toUUID('612f3c40-5d3b-217e-707b-6a546a3d7b29') AS uuid,
+    UUIDToNum(uuid) AS bytes
+```
+
+``` text
+┌─uuid─────────────────────────────────┬─bytes────────────┐
+│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │
+└──────────────────────────────────────┴──────────────────┘
+```
+``` sql
+SELECT
+    toUUID('612f3c40-5d3b-217e-707b-6a546a3d7b29') AS uuid,
+    UUIDToNum(uuid, 2) AS bytes
+```
+
+```text
+┌─uuid─────────────────────────────────┬─bytes────────────┐
+│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ @</a;]~!p{jTj={) │
+└──────────────────────────────────────┴──────────────────┘
+```

From … Mon Sep 17 00:00:00 2001
From: …
Date: Tue, 23 Apr 2024 14:19:56 +0000
Subject: [PATCH 045/192] Rename MergeTreeIndexInverted* to MergeTreeIndexFullText*

---
 .../mergetree-family/invertedindexes.md | 46 ++++++-------
 src/Core/Settings.h | 2 +-
 src/Interpreters/GinFilter.cpp | 2 +-
 .../MergeTreeDataPartWriterOnDisk.cpp | 8 +--
 .../MergeTree/MergeTreeDataSelectExecutor.cpp | 6 +-
 .../MergeTree/MergeTreeIndexBloomFilter.h | 5 ++
 ...nverted.cpp => MergeTreeIndexFullText.cpp} | 66 +++++++++----------
 ...dexInverted.h => MergeTreeIndexFullText.h} | 28 ++++----
 src/Storages/MergeTree/MergeTreeIndices.cpp | 6 +-
 src/Storages/MergeTree/MergeTreeIndices.h | 4 +-
 src/Storages/MergeTree/MutateTask.cpp | 4 +-
 ...> 02346_fulltext_index_bug47393.reference} | 0
 ....sql => 02346_fulltext_index_bug47393.sql} | 2 +-
 ...> 02346_fulltext_index_bug52019.reference} | 0
 ....sql => 02346_fulltext_index_bug52019.sql} | 2 +-
 ...> 02346_fulltext_index_bug59039.reference} | 0
 ....sql => 02346_fulltext_index_bug59039.sql} | 2 +-
 ...46_fulltext_index_detach_attach.reference} | 0
 ...=> 02346_fulltext_index_detach_attach.sql} | 2 +-
 ..._fulltext_index_match_predicate.reference} | 0
 ... 02346_fulltext_index_match_predicate.sql} | 2 +-
 ...
=> 02346_fulltext_index_search.reference} | 20 +++--- ...ch.sql => 02346_fulltext_index_search.sql} | 64 +++++++++--------- 23 files changed, 138 insertions(+), 133 deletions(-) rename src/Storages/MergeTree/{MergeTreeIndexInverted.cpp => MergeTreeIndexFullText.cpp} (93%) rename src/Storages/MergeTree/{MergeTreeIndexInverted.h => MergeTreeIndexFullText.h} (87%) rename tests/queries/0_stateless/{02346_inverted_index_bug47393.reference => 02346_fulltext_index_bug47393.reference} (100%) rename tests/queries/0_stateless/{02346_inverted_index_bug47393.sql => 02346_fulltext_index_bug47393.sql} (93%) rename tests/queries/0_stateless/{02346_inverted_index_bug52019.reference => 02346_fulltext_index_bug52019.reference} (100%) rename tests/queries/0_stateless/{02346_inverted_index_bug52019.sql => 02346_fulltext_index_bug52019.sql} (91%) rename tests/queries/0_stateless/{02346_inverted_index_bug59039.reference => 02346_fulltext_index_bug59039.reference} (100%) rename tests/queries/0_stateless/{02346_inverted_index_bug59039.sql => 02346_fulltext_index_bug59039.sql} (93%) rename tests/queries/0_stateless/{02346_inverted_index_detach_attach.reference => 02346_fulltext_index_detach_attach.reference} (100%) rename tests/queries/0_stateless/{02346_inverted_index_detach_attach.sql => 02346_fulltext_index_detach_attach.sql} (82%) rename tests/queries/0_stateless/{02346_inverted_index_match_predicate.reference => 02346_fulltext_index_match_predicate.reference} (100%) rename tests/queries/0_stateless/{02346_inverted_index_match_predicate.sql => 02346_fulltext_index_match_predicate.sql} (97%) rename tests/queries/0_stateless/{02346_inverted_index_search.reference => 02346_fulltext_index_search.reference} (74%) rename tests/queries/0_stateless/{02346_inverted_index_search.sql => 02346_fulltext_index_search.sql} (87%) diff --git a/docs/en/engines/table-engines/mergetree-family/invertedindexes.md b/docs/en/engines/table-engines/mergetree-family/invertedindexes.md index 7e5140b4c4f..9374f6a3ac1 100644 --- a/docs/en/engines/table-engines/mergetree-family/invertedindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/invertedindexes.md @@ -1,19 +1,19 @@ --- slug: /en/engines/table-engines/mergetree-family/invertedindexes -sidebar_label: Inverted Indexes +sidebar_label: Full-text Indexes description: Quickly find search terms in text. keywords: [full-text search, text search, inverted, index, indices] --- -# Full-text Search using Inverted Indexes [experimental] +# Full-text Search using Full-text Indexes [experimental] -Inverted indexes are an experimental type of [secondary indexes](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#available-types-of-indices) which provide fast text search +Full-text indexes are an experimental type of [secondary indexes](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#available-types-of-indices) which provide fast text search capabilities for [String](/docs/en/sql-reference/data-types/string.md) or [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) -columns. The main idea of an inverted index is to store a mapping from "terms" to the rows which contain these terms. "Terms" are +columns. The main idea of a full-text index is to store a mapping from "terms" to the rows which contain these terms. "Terms" are tokenized cells of the string column. For example, the string cell "I will be a little late" is by default tokenized into six terms "I", "will", "be", "a", "little" and "late". Another kind of tokenizer is n-grams. 
For example, the result of 3-gram tokenization will be 21 terms "I w", " wi", "wil", "ill", "ll ", "l b", " be" etc. The more fine-granular the input strings are tokenized, the bigger but also the more -useful the resulting inverted index will be. +useful the resulting full-text index will be.
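To make the two tokenizers described in the documentation above concrete, here is a simplified sketch of both. The real token extractors in ClickHouse are more involved (UTF-8 handling, escaping, and so on), so treat this as an illustration of the idea only:

```cpp
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Default tokenization: split the string on non-alphanumeric characters.
std::vector<std::string> splitTokens(const std::string & s)
{
    std::vector<std::string> tokens;
    std::string current;
    for (unsigned char c : s)
    {
        if (std::isalnum(c))
            current += static_cast<char>(c);
        else if (!current.empty())
        {
            tokens.push_back(current);
            current.clear();
        }
    }
    if (!current.empty())
        tokens.push_back(current);
    return tokens;
}

// n-gram tokenization: every window of n consecutive characters is a term,
// giving len - n + 1 terms for an input of len >= n characters.
std::vector<std::string> ngrams(const std::string & s, size_t n)
{
    std::vector<std::string> terms;
    for (size_t i = 0; i + n <= s.size(); ++i)
        terms.push_back(s.substr(i, n));
    return terms;
}

int main()
{
    const std::string cell = "I will be a little late";
    std::cout << splitTokens(cell).size() << " tokens\n"; // 6: I, will, be, a, little, late
    std::cout << ngrams(cell, 3).size() << " 3-grams\n";  // 21: "I w", " wi", "wil", ...
}
```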