From 0dbbca5b3e6ff8959604bfb3c69b097261c147b1 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 11 May 2022 19:33:45 +0000 Subject: [PATCH 01/42] fix filling of missed Nested columns with multiple levels --- .../Serializations/SerializationArray.cpp | 38 +++++----- .../Serializations/SerializationArray.h | 1 + src/Interpreters/inplaceBlockConversions.cpp | 73 ++++++++++++++----- src/Interpreters/inplaceBlockConversions.h | 1 + src/Storages/MergeTree/IMergeTreeReader.cpp | 6 +- src/Storages/StorageMemory.cpp | 2 +- .../0_stateless/01825_type_json_17.reference | 7 ++ .../0_stateless/01825_type_json_17.sql | 18 +++++ 8 files changed, 108 insertions(+), 38 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_17.reference create mode 100644 tests/queries/0_stateless/01825_type_json_17.sql diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 30ee5e98b74..aebfb1b27b2 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -132,29 +132,29 @@ namespace offset_values.resize(i); } +} - ColumnPtr arraySizesToOffsets(const IColumn & column) - { - const auto & column_sizes = assert_cast(column); - MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); - - if (column_sizes.empty()) - return column_offsets; - - const auto & sizes_data = column_sizes.getData(); - auto & offsets_data = assert_cast(*column_offsets).getData(); - - offsets_data.resize(sizes_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = sizes_data.size(); i < size; ++i) - { - prev_offset += sizes_data[i]; - offsets_data[i] = prev_offset; - } +ColumnPtr arraySizesToOffsets(const IColumn & column) +{ + const auto & column_sizes = assert_cast(column); + MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + if (column_sizes.empty()) return column_offsets; + + const auto & sizes_data = column_sizes.getData(); + auto & offsets_data = assert_cast(*column_offsets).getData(); + + offsets_data.resize(sizes_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = sizes_data.size(); i < size; ++i) + { + prev_offset += sizes_data[i]; + offsets_data[i] = prev_offset; } + + return column_offsets; } ColumnPtr arrayOffsetsToSizes(const IColumn & column) diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 3769f8a4513..9179988bf10 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -80,5 +80,6 @@ private: }; ColumnPtr arrayOffsetsToSizes(const IColumn & column); +ColumnPtr arraySizesToOffsets(const IColumn & column); } diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index 1bde6fe5a8c..bc40d64de16 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +199,9 @@ static bool arrayHasNoElementsRead(const IColumn & column) if (!size) return false; + if (const auto * nested_array = typeid_cast(&column_array->getData())) + return arrayHasNoElementsRead(*nested_array); + size_t data_size = column_array->getData().size(); if (data_size) return false; @@ -210,6 +214,7 @@ void fillMissingColumns( Columns & res_columns, size_t num_rows, const NamesAndTypesList & requested_columns, + const NamesAndTypesList 
& available_columns, StorageMetadataPtr metadata_snapshot) { size_t num_columns = requested_columns.size(); @@ -224,26 +229,35 @@ void fillMissingColumns( /// First, collect offset columns for all arrays in the block. std::unordered_map offset_columns; - auto requested_column = requested_columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) + auto available_column = available_columns.begin(); + for (size_t i = 0; i < num_columns; ++i, ++available_column) { if (res_columns[i] == nullptr) continue; - if (const auto * array = typeid_cast(res_columns[i].get())) + auto serialization = IDataType::getSerialization(*available_column); + auto name_in_storage = Nested::extractTableName(available_column->name); + + ISerialization::SubstreamPath path; + serialization->enumerateStreams(path, [&](const auto & subpath) { - String offsets_name = Nested::extractTableName(requested_column->name); - auto & offsets_column = offset_columns[offsets_name]; + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto & offsets_column = offset_columns[Nested::concatenateName(name_in_storage, subname)]; /// If for some reason multiple offsets columns are present for the same nested data structure, /// choose the one that is not empty. + /// TODO: more optimal if (!offsets_column || offsets_column->empty()) - offsets_column = array->getOffsetsPtr(); - } + offsets_column = arraySizesToOffsets(*subpath.back().data.column); + + }, {serialization, available_column->type, res_columns[i], nullptr}); } /// insert default values only for columns without default expressions - requested_column = requested_columns.begin(); + auto requested_column = requested_columns.begin(); for (size_t i = 0; i < num_columns; ++i, ++requested_column) { const auto & [name, type] = *requested_column; @@ -256,19 +270,44 @@ void fillMissingColumns( if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) continue; - String offsets_name = Nested::extractTableName(name); - auto offset_it = offset_columns.find(offsets_name); + std::vector current_offsets; + bool has_all_offsets = true; + const auto * array_type = typeid_cast(type.get()); - if (offset_it != offset_columns.end() && array_type) + if (array_type) { - const auto & nested_type = array_type->getNestedType(); - ColumnPtr offsets_column = offset_it->second; - size_t nested_rows = typeid_cast(*offsets_column).getData().back(); + auto serialization = IDataType::getSerialization(*requested_column); + auto name_in_storage = Nested::extractTableName(requested_column->name); - ColumnPtr nested_column = - nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); + ISerialization::SubstreamPath path; + serialization->enumerateStreams(path, [&](const auto & subpath) + { + if (!has_all_offsets) + return; - res_columns[i] = ColumnArray::create(nested_column, offsets_column); + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto it = offset_columns.find(Nested::concatenateName(name_in_storage, subname)); + if (it != offset_columns.end()) + current_offsets.emplace_back(it->second); + else + has_all_offsets = false; + + }, {serialization, type, nullptr, nullptr}); + } + + if (array_type && has_all_offsets) + { + assert(!current_offsets.empty()); + auto scalar_type = getBaseTypeOfArray(type); + + 
size_t data_size = assert_cast(*current_offsets.back()).getData().back(); + res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst(); + + for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it) + res_columns[i] = ColumnArray::create(res_columns[i], *it); } else { diff --git a/src/Interpreters/inplaceBlockConversions.h b/src/Interpreters/inplaceBlockConversions.h index b3113ddfa5c..70187d5aace 100644 --- a/src/Interpreters/inplaceBlockConversions.h +++ b/src/Interpreters/inplaceBlockConversions.h @@ -43,6 +43,7 @@ void fillMissingColumns( Columns & res_columns, size_t num_rows, const NamesAndTypesList & requested_columns, + const NamesAndTypesList & available_columns, StorageMetadataPtr metadata_snapshot); } diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 3a823345dda..4eff1653d1e 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -66,7 +66,11 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e { try { - DB::fillMissingColumns(res_columns, num_rows, columns, metadata_snapshot); + NamesAndTypesList available_columns; + for (const auto & column : columns) + available_columns.push_back(getColumnFromPart(column)); + + DB::fillMissingColumns(res_columns, num_rows, columns, available_columns, metadata_snapshot); should_evaluate_missing_defaults = std::any_of( res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); } diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index e7911125383..20ca84452e7 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -91,7 +91,7 @@ protected: ++name_and_type; } - fillMissingColumns(columns, src.rows(), column_names_and_types, /*metadata_snapshot=*/ nullptr); + fillMissingColumns(columns, src.rows(), column_names_and_types, column_names_and_types, /*metadata_snapshot=*/ nullptr); assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; })); return Chunk(std::move(columns), src.rows()); diff --git a/tests/queries/0_stateless/01825_type_json_17.reference b/tests/queries/0_stateless/01825_type_json_17.reference new file mode 100644 index 00000000000..96e58224f32 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_17.reference @@ -0,0 +1,7 @@ +Tuple(arr Nested(k1 Nested(k2 String, k3 String, k4 Int8), k5 Tuple(k6 String)), id Int8) +{"obj":{"arr":[{"k1":[{"k2":"aaa","k3":"bbb","k4":0},{"k2":"ccc","k3":"","k4":0}],"k5":{"k6":""}}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"","k3":"ddd","k4":10},{"k2":"","k3":"","k4":20}],"k5":{"k6":"foo"}}],"id":2}} +[['bbb','']] [['aaa','ccc']] +[['ddd','']] [['','']] +[[0,0]] +[[10,20]] diff --git a/tests/queries/0_stateless/01825_type_json_17.sql b/tests/queries/0_stateless/01825_type_json_17.sql new file mode 100644 index 00000000000..b34357f8ef1 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_17.sql @@ -0,0 +1,18 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json_17; +SET allow_experimental_object_type = 1; +SET output_format_json_named_tuples_as_objects = 1; + +CREATE TABLE t_json_17(obj JSON) +ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], 
"k5": {"k6": "foo"}}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k1.k3, obj.arr.k1.k2 FROM t_json_17 ORDER BY obj.id; +SELECT obj.arr.k1.k4 FROM t_json_17 ORDER BY obj.id; + +DROP TABLE IF EXISTS t_json_17; From 56fffc86810773e0399de5147b08ced18b120822 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 12 May 2022 17:21:30 +0000 Subject: [PATCH 02/42] fix filling of missing columns --- src/Interpreters/inplaceBlockConversions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index bc40d64de16..61d51da4eab 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -232,7 +232,7 @@ void fillMissingColumns( auto available_column = available_columns.begin(); for (size_t i = 0; i < num_columns; ++i, ++available_column) { - if (res_columns[i] == nullptr) + if (res_columns[i] == nullptr || isColumnConst(*res_columns[i])) continue; auto serialization = IDataType::getSerialization(*available_column); From 1523c9c9e5f714daff08d7811890effb25e4a87b Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 17 Jun 2022 01:10:52 +0000 Subject: [PATCH 03/42] fix filling of empty Nested + small refactoring --- src/Columns/ColumnArray.cpp | 6 +- .../CompressionFactoryAdditions.cpp | 4 +- src/DataTypes/IDataType.cpp | 29 +++-- src/DataTypes/IDataType.h | 2 + .../Serializations/ISerialization.cpp | 35 ++--- src/DataTypes/Serializations/ISerialization.h | 44 ++++++- .../Serializations/SerializationArray.cpp | 123 +++++++++--------- .../Serializations/SerializationArray.h | 5 +- .../SerializationLowCardinality.cpp | 27 ++-- .../SerializationLowCardinality.h | 2 +- .../Serializations/SerializationMap.cpp | 15 +-- .../Serializations/SerializationMap.h | 2 +- .../Serializations/SerializationNamed.cpp | 12 +- .../Serializations/SerializationNamed.h | 2 +- .../Serializations/SerializationNullable.cpp | 40 +++--- .../Serializations/SerializationNullable.h | 2 +- .../Serializations/SerializationSparse.cpp | 39 +++--- .../Serializations/SerializationSparse.h | 2 +- .../Serializations/SerializationTuple.cpp | 15 +-- .../Serializations/SerializationTuple.h | 2 +- .../Serializations/SerializationWrapper.cpp | 4 +- .../Serializations/SerializationWrapper.h | 2 +- src/Interpreters/InterpreterDescribeQuery.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 25 ++-- src/Storages/ColumnsDescription.cpp | 2 +- .../MergeTreeDataPartWriterCompact.cpp | 3 +- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 30 ++--- .../MergeTree/MergeTreeDataPartWriterWide.h | 9 +- .../MergeTree/MergeTreeReaderCompact.cpp | 23 ++-- .../MergeTree/MergeTreeReaderWide.cpp | 2 +- .../System/StorageSystemPartsColumns.cpp | 4 +- 31 files changed, 264 insertions(+), 250 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 24da9644335..7bdb66a9cc5 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -50,13 +50,15 @@ ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && if (!offsets_concrete) throw Exception("offsets_column must be a ColumnUInt64", ErrorCodes::LOGICAL_ERROR); - if (!offsets_concrete->empty() && data) + if (!offsets_concrete->empty() && data && !data->empty()) { Offset last_offset = offsets_concrete->getData().back(); /// This will also prevent possible overflow in offset. 
if (data->size() != last_offset) - throw Exception("offsets_column has data inconsistent with nested_column", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, + "offsets_column has data ({}) inconsistent with nested_column ({})", + data->size(), last_offset); } /** NOTE diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index d87d0f8b4ee..3e215076871 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -116,8 +116,8 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( } }; - ISerialization::SubstreamPath path; - column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type); + auto serialization = column_type->getDefaultSerialization(); + serialization->enumerateStreams(callback, column_type); if (!result_codec) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName()); diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index f2bb878a533..8229946cac8 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -84,18 +84,20 @@ void IDataType::forEachSubcolumn( { for (size_t i = 0; i < subpath.size(); ++i) { - if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1)) + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) { - auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1); - auto subdata = ISerialization::createFromPath(subpath, i); + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + auto subdata = ISerialization::createFromPath(subpath, prefix_len); callback(subpath, name, subdata); } subpath[i].visited = true; } }; - SubstreamPath path; - data.serialization->enumerateStreams(path, callback_with_data, data); + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); } template @@ -118,33 +120,38 @@ Ptr IDataType::getForSubcolumn( return res; } +bool IDataType::hasSubcolumn(const String & subcolumn_name) const +{ + return tryGetSubcolumnType(subcolumn_name) != nullptr; +} + DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const { - SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, false); } DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const { - SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, true); } ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { - SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withColumn(column); return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, false); } ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { - SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; + auto data = 
SubstreamData(getDefaultSerialization()).withColumn(column); return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, true); } SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const { - SubstreamData data = { serialization, nullptr, nullptr, nullptr }; + auto data = SubstreamData(serialization); return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization, true); } @@ -154,7 +161,7 @@ Names IDataType::getSubcolumnNames() const forEachSubcolumn([&](const auto &, const auto & name, const auto &) { res.push_back(name); - }, { getDefaultSerialization(), nullptr, nullptr, nullptr }); + }, SubstreamData(getDefaultSerialization())); return res; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 420ef61a13f..9161fd7dc7b 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -79,6 +79,8 @@ public: /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; + bool hasSubcolumn(const String & subcolumn_name) const; + DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const; DataTypePtr getSubcolumnType(const String & subcolumn_name) const; diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 7df4a956c1a..b9b3b9ac6af 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -73,24 +73,24 @@ String ISerialization::SubstreamPath::toString() const } void ISerialization::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - path.push_back(Substream::Regular); - path.back().data = data; - callback(path); - path.pop_back(); + settings.path.push_back(Substream::Regular); + settings.path.back().data = data; + callback(settings.path); + settings.path.pop_back(); } -void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void ISerialization::enumerateStreams( + const StreamCallback & callback, + const DataTypePtr & type, + const ColumnPtr & column) const { - enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr}); -} - -void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const -{ - enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr}); + EnumerateStreamsSettings settings; + auto data = SubstreamData(getPtr()).withType(type).withColumn(column); + enumerateStreams(settings, callback, data); } void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const @@ -184,7 +184,7 @@ String ISerialization::getFileNameForStream(const NameAndTypePair & column, cons return getFileNameForStream(column.getNameInStorage(), path); } -static size_t isOffsetsOfNested(const ISerialization::SubstreamPath & path) +bool isOffsetsOfNested(const ISerialization::SubstreamPath & path) { if (path.empty()) return false; @@ -288,10 +288,13 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) { - assert(prefix_len < path.size()); + assert(prefix_len <= path.size()); + if (prefix_len == 0) + return {}; - SubstreamData res = path[prefix_len].data; - for (ssize_t i = static_cast(prefix_len) - 1; i >= 0; --i) + ssize_t last_elem 
= prefix_len - 1; + auto res = path[last_elem].data; + for (ssize_t i = last_elem - 1; i >= 0; --i) { const auto & creator = path[i].creator; if (creator) diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index b5d2082631e..1193c15b939 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -101,6 +101,30 @@ public: struct SubstreamData { + SubstreamData() = default; + SubstreamData(SerializationPtr serialization_) + : serialization(std::move(serialization_)) + { + } + + SubstreamData & withType(DataTypePtr type_) + { + type = std::move(type_); + return *this; + } + + SubstreamData & withColumn(ColumnPtr column_) + { + column = std::move(column_); + return *this; + } + + SubstreamData & withSerializationInfo(SerializationInfoPtr serialization_info_) + { + serialization_info = std::move(serialization_info_); + return *this; + } + SerializationPtr serialization; DataTypePtr type; ColumnPtr column; @@ -164,16 +188,22 @@ public: using StreamCallback = std::function; + struct EnumerateStreamsSettings + { + SubstreamPath path; + bool position_independent_encoding = true; + }; + virtual void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const; - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const; - void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } - void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } - - void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const; + /// Enumerate streams with default settings. 
+ void enumerateStreams( + const StreamCallback & callback, + const DataTypePtr & type = nullptr, + const ColumnPtr & column = nullptr) const; using OutputStreamGetter = std::function; using InputStreamGetter = std::function; @@ -375,4 +405,6 @@ State * ISerialization::checkAndGetState(const StatePtr & state) const return state_concrete; } +bool isOffsetsOfNested(const ISerialization::SubstreamPath & path); + } diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index aebfb1b27b2..85182c3a111 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -132,53 +132,53 @@ namespace offset_values.resize(i); } -} -ColumnPtr arraySizesToOffsets(const IColumn & column) -{ - const auto & column_sizes = assert_cast(column); - MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + ColumnPtr arraySizesToOffsets(const IColumn & column) + { + const auto & column_sizes = assert_cast(column); + MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + + if (column_sizes.empty()) + return column_offsets; + + const auto & sizes_data = column_sizes.getData(); + auto & offsets_data = assert_cast(*column_offsets).getData(); + + offsets_data.resize(sizes_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = sizes_data.size(); i < size; ++i) + { + prev_offset += sizes_data[i]; + offsets_data[i] = prev_offset; + } - if (column_sizes.empty()) return column_offsets; - - const auto & sizes_data = column_sizes.getData(); - auto & offsets_data = assert_cast(*column_offsets).getData(); - - offsets_data.resize(sizes_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = sizes_data.size(); i < size; ++i) - { - prev_offset += sizes_data[i]; - offsets_data[i] = prev_offset; } - return column_offsets; -} + ColumnPtr arrayOffsetsToSizes(const IColumn & column) + { + const auto & column_offsets = assert_cast(column); + MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); -ColumnPtr arrayOffsetsToSizes(const IColumn & column) -{ - const auto & column_offsets = assert_cast(column); - MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); + if (column_offsets.empty()) + return column_sizes; + + const auto & offsets_data = column_offsets.getData(); + auto & sizes_data = assert_cast(*column_sizes).getData(); + + sizes_data.resize(offsets_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = offsets_data.size(); i < size; ++i) + { + auto current_offset = offsets_data[i]; + sizes_data[i] = current_offset - prev_offset; + prev_offset = current_offset; + } - if (column_offsets.empty()) return column_sizes; - - const auto & offsets_data = column_offsets.getData(); - auto & sizes_data = assert_cast(*column_sizes).getData(); - - sizes_data.resize(offsets_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = offsets_data.size(); i < size; ++i) - { - auto current_offset = offsets_data[i]; - sizes_data[i] = current_offset - prev_offset; - prev_offset = current_offset; } - - return column_sizes; } DataTypePtr SerializationArray::SubcolumnCreator::create(const DataTypePtr & prev) const @@ -197,41 +197,42 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c } void SerializationArray::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * type_array = 
data.type ? &assert_cast(*data.type) : nullptr; const auto * column_array = data.column ? &assert_cast(*data.column) : nullptr; - auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr; + auto offsets = column_array ? column_array->getOffsetsPtr() : nullptr; - path.push_back(Substream::ArraySizes); - path.back().data = - { + auto offsets_serialization = std::make_shared( std::make_shared>(), - "size" + std::to_string(getArrayLevel(path)), false), - data.type ? std::make_shared() : nullptr, - offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr, - data.serialization_info, - }; + "size" + std::to_string(getArrayLevel(settings.path)), false); - callback(path); + auto offsets_column = offsets && !settings.position_independent_encoding + ? arrayOffsetsToSizes(*offsets) + : offsets; - path.back() = Substream::ArrayElements; - path.back().data = data; - path.back().creator = std::make_shared(offsets_column); + settings.path.push_back(Substream::ArraySizes); + settings.path.back().data = SubstreamData(offsets_serialization) + .withType(type_array ? std::make_shared() : nullptr) + .withColumn(std::move(offsets_column)) + .withSerializationInfo(data.serialization_info); - SubstreamData next_data = - { - nested, - type_array ? type_array->getNestedType() : nullptr, - column_array ? column_array->getDataPtr() : nullptr, - data.serialization_info, - }; + callback(settings.path); - nested->enumerateStreams(path, callback, next_data); - path.pop_back(); + settings.path.back() = Substream::ArrayElements; + settings.path.back().data = data; + settings.path.back().creator = std::make_shared(offsets); + + auto next_data = SubstreamData(nested) + .withType(type_array ? type_array->getNestedType() : nullptr) + .withColumn(column_array ? column_array->getDataPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); } void SerializationArray::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 9179988bf10..84e37acbaad 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -36,7 +36,7 @@ public: */ void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; @@ -79,7 +79,4 @@ private: }; }; -ColumnPtr arrayOffsetsToSizes(const IColumn & column); -ColumnPtr arraySizesToOffsets(const IColumn & column); - } diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index c79f588e46c..2473a74327d 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -41,30 +41,25 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic } void SerializationLowCardinality::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr; - SubstreamData dict_data = - { - dict_inner_serialization, - data.type ? dictionary_type : nullptr, - column_lc ? 
column_lc->getDictionary().getNestedColumn() : nullptr, - data.serialization_info, - }; + settings.path.push_back(Substream::DictionaryKeys); + settings.path.back().data = SubstreamData(dict_inner_serialization) + .withType(data.type ? dictionary_type : nullptr) + .withColumn(column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr) + .withSerializationInfo(data.serialization_info); - path.push_back(Substream::DictionaryKeys); - path.back().data = dict_data; + dict_inner_serialization->enumerateStreams(settings, callback, settings.path.back().data); - dict_inner_serialization->enumerateStreams(path, callback, dict_data); + settings.path.back() = Substream::DictionaryIndexes; + settings.path.back().data = data; - path.back() = Substream::DictionaryIndexes; - path.back().data = data; - - callback(path); - path.pop_back(); + callback(settings.path); + settings.path.pop_back(); } struct KeysSerializationVersion diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index 0a3597e86c7..860920a2422 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -18,7 +18,7 @@ public: explicit SerializationLowCardinality(const DataTypePtr & dictionary_type); void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index ea22070b5b1..e46bb480d14 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -257,19 +257,16 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c } void SerializationMap::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - SubstreamData next_data = - { - nested, - data.type ? assert_cast(*data.type).getNestedType() : nullptr, - data.column ? assert_cast(*data.column).getNestedColumnPtr() : nullptr, - data.serialization_info, - }; + auto next_data = SubstreamData(nested) + .withType(data.type ? assert_cast(*data.type).getNestedType() : nullptr) + .withColumn(data.column ? 
assert_cast(*data.column).getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - nested->enumerateStreams(path, callback, next_data); + nested->enumerateStreams(settings, callback, next_data); } void SerializationMap::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index 93b3e179499..42f99ca7991 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -32,7 +32,7 @@ public: void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 097e9cedfbe..4dac4b3a922 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -4,16 +4,16 @@ namespace DB { void SerializationNamed::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - addToPath(path); - path.back().data = data; - path.back().creator = std::make_shared(name, escape_delimiter); + addToPath(settings.path); + settings.path.back().data = data; + settings.path.back().creator = std::make_shared(name, escape_delimiter); - nested_serialization->enumerateStreams(path, callback, data); - path.pop_back(); + nested_serialization->enumerateStreams(settings, callback, data); + settings.path.pop_back(); } void SerializationNamed::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h index 343b96c16e3..2a2c7c0dfc7 100644 --- a/src/DataTypes/Serializations/SerializationNamed.h +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -26,7 +26,7 @@ public: const String & getElementName() const { return name; } void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index a6273deaa30..47780f67800 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -38,38 +38,34 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev } void SerializationNullable::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * type_nullable = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_nullable = data.column ? &assert_cast(*data.column) : nullptr; - path.push_back(Substream::NullMap); - path.back().data = - { - std::make_shared(std::make_shared>(), "null", false), - type_nullable ? std::make_shared() : nullptr, - column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr, - data.serialization_info, - }; + auto null_map_serialization = std::make_shared(std::make_shared>(), "null", false); - callback(path); + settings.path.push_back(Substream::NullMap); + settings.path.back().data = SubstreamData(null_map_serialization) + .withType(type_nullable ? 
std::make_shared() : nullptr) + .withColumn(column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - path.back() = Substream::NullableElements; - path.back().creator = std::make_shared(path.back().data.column); - path.back().data = data; + callback(settings.path); - SubstreamData next_data = - { - nested, - type_nullable ? type_nullable->getNestedType() : nullptr, - column_nullable ? column_nullable->getNestedColumnPtr() : nullptr, - data.serialization_info, - }; + settings.path.back() = Substream::NullableElements; + settings.path.back().creator = std::make_shared(settings.path.back().data.column); + settings.path.back().data = data; - nested->enumerateStreams(path, callback, next_data); - path.pop_back(); + auto next_data = SubstreamData(nested) + .withType(type_nullable ? type_nullable->getNestedType() : nullptr) + .withColumn(column_nullable ? column_nullable->getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); } void SerializationNullable::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index e6e0e4f33c2..ea3958065e7 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -14,7 +14,7 @@ public: explicit SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {} void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index 64db248c5fc..db194007af9 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -148,39 +148,32 @@ ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev) } void SerializationSparse::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * column_sparse = data.column ? &assert_cast(*data.column) : nullptr; - size_t column_size = column_sparse ? column_sparse->size() : 0; - path.push_back(Substream::SparseOffsets); - path.back().data = - { - std::make_shared>(), - data.type ? std::make_shared() : nullptr, - column_sparse ? column_sparse->getOffsetsPtr() : nullptr, - data.serialization_info, - }; + settings.path.push_back(Substream::SparseOffsets); + settings.path.back().data = SubstreamData(std::make_shared>()) + .withType(data.type ? std::make_shared() : nullptr) + .withColumn(column_sparse ? column_sparse->getOffsetsPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - callback(path); + callback(settings.path); - path.back() = Substream::SparseElements; - path.back().creator = std::make_shared(path.back().data.column, column_size); - path.back().data = data; + settings.path.back() = Substream::SparseElements; + settings.path.back().creator = std::make_shared(settings.path.back().data.column, column_size); + settings.path.back().data = data; - SubstreamData next_data = - { - nested, - data.type, - column_sparse ? 
column_sparse->getValuesPtr() : nullptr, - data.serialization_info, - }; + auto next_data = SubstreamData(nested) + .withType(data.type) + .withColumn(column_sparse ? column_sparse->getValuesPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - nested->enumerateStreams(path, callback, next_data); - path.pop_back(); + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); } void SerializationSparse::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationSparse.h b/src/DataTypes/Serializations/SerializationSparse.h index 54ab4853360..dc2f63c5a05 100644 --- a/src/DataTypes/Serializations/SerializationSparse.h +++ b/src/DataTypes/Serializations/SerializationSparse.h @@ -28,7 +28,7 @@ public: Kind getKind() const override { return Kind::SPARSE; } virtual void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 8dc15fc9841..437324d96fd 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -283,7 +283,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, } void SerializationTuple::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { @@ -293,15 +293,12 @@ void SerializationTuple::enumerateStreams( for (size_t i = 0; i < elems.size(); ++i) { - SubstreamData next_data = - { - elems[i], - type_tuple ? type_tuple->getElement(i) : nullptr, - column_tuple ? column_tuple->getColumnPtr(i) : nullptr, - info_tuple ? info_tuple->getElementInfo(i) : nullptr, - }; + auto next_data = SubstreamData(elems[i]) + .withType(type_tuple ? type_tuple->getElement(i) : nullptr) + .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) + .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr); - elems[i]->enumerateStreams(path, callback, next_data); + elems[i]->enumerateStreams(settings, callback, next_data); } } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index e82d8473645..d1caeb73dad 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -34,7 +34,7 @@ public: /** Each sub-column in a tuple is serialized in separate stream. 
*/ void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index 271c53dfcf1..7c50c1c6e26 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -5,11 +5,11 @@ namespace DB { void SerializationWrapper::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - nested_serialization->enumerateStreams(path, callback, data); + nested_serialization->enumerateStreams(settings, callback, data); } void SerializationWrapper::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 43fc7e9914a..d010c6b5314 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -21,7 +21,7 @@ public: Kind getKind() const override { return nested_serialization->getKind(); } void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index 9919b1272bd..0524feea1f6 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -163,7 +163,7 @@ BlockIO InterpreterDescribeQuery::execute() res_columns[6]->insertDefault(); res_columns[7]->insert(1u); - }, { type->getDefaultSerialization(), type, nullptr, nullptr }); + }, ISerialization::SubstreamData(type->getDefaultSerialization()).withType(type)); } } diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index 61d51da4eab..e4f1b46fc91 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -199,11 +199,11 @@ static bool arrayHasNoElementsRead(const IColumn & column) if (!size) return false; - if (const auto * nested_array = typeid_cast(&column_array->getData())) + const auto & array_data = column_array->getData(); + if (const auto * nested_array = typeid_cast(&array_data)) return arrayHasNoElementsRead(*nested_array); - size_t data_size = column_array->getData().size(); - if (data_size) + if (!array_data.empty()) return false; size_t last_offset = column_array->getOffsets()[size - 1]; @@ -238,8 +238,7 @@ void fillMissingColumns( auto serialization = IDataType::getSerialization(*available_column); auto name_in_storage = Nested::extractTableName(available_column->name); - ISerialization::SubstreamPath path; - serialization->enumerateStreams(path, [&](const auto & subpath) + serialization->enumerateStreams([&](const auto & subpath) { if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) return; @@ -247,16 +246,15 @@ void fillMissingColumns( auto subname = ISerialization::getSubcolumnNameForStream(subpath); auto & offsets_column = offset_columns[Nested::concatenateName(name_in_storage, subname)]; - /// If for some reason multiple offsets columns are present for the same nested data structure, - /// choose the one that is not empty. 
- /// TODO: more optimal + /// If for some reason multiple offsets columns are present + /// for the same nested data structure, choose the one that is not empty. if (!offsets_column || offsets_column->empty()) - offsets_column = arraySizesToOffsets(*subpath.back().data.column); + offsets_column = subpath.back().data.column; - }, {serialization, available_column->type, res_columns[i], nullptr}); + }, available_column->type, res_columns[i]); } - /// insert default values only for columns without default expressions + /// Insert default values only for columns without default expressions. auto requested_column = requested_columns.begin(); for (size_t i = 0; i < num_columns; ++i, ++requested_column) { @@ -279,8 +277,7 @@ void fillMissingColumns( auto serialization = IDataType::getSerialization(*requested_column); auto name_in_storage = Nested::extractTableName(requested_column->name); - ISerialization::SubstreamPath path; - serialization->enumerateStreams(path, [&](const auto & subpath) + serialization->enumerateStreams([&](const auto & subpath) { if (!has_all_offsets) return; @@ -295,7 +292,7 @@ void fillMissingColumns( else has_all_offsets = false; - }, {serialization, type, nullptr, nullptr}); + }, type); } if (array_type && has_all_offsets) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 7a43ae7af4b..e9f4c1b92aa 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -780,7 +780,7 @@ void ColumnsDescription::addSubcolumns(const String & name_in_storage, const Dat "Cannot add subcolumn {}: column with this name already exists", subcolumn.name); subcolumns.get<0>().insert(std::move(subcolumn)); - }, {type_in_storage->getDefaultSerialization(), type_in_storage, nullptr, nullptr}); + }, ISerialization::SubstreamData(type_in_storage->getDefaultSerialization()).withType(type_in_storage)); } void ColumnsDescription::removeSubcolumns(const String & name_in_storage) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index a4786570fcb..bbdd2734c98 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -67,8 +67,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, compressed_streams.emplace(stream_name, stream); }; - ISerialization::SubstreamPath path; - data_part->getSerialization(column)->enumerateStreams(path, callback, column.type); + data_part->getSerialization(column)->enumerateStreams(callback, column.type); } namespace diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 6610b8fc06b..7c08e127ab4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -119,8 +119,7 @@ void MergeTreeDataPartWriterWide::addStreams( settings.query_write_settings); }; - ISerialization::SubstreamPath path; - data_part->getSerialization(column)->enumerateStreams(path, callback, column.type); + data_part->getSerialization(column)->enumerateStreams(callback, column.type); } @@ -254,10 +253,9 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm void MergeTreeDataPartWriterWide::writeSingleMark( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - size_t number_of_rows, - ISerialization::SubstreamPath & path) + size_t number_of_rows) { - 
StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns, path); + StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns); for (const auto & mark : marks) flushMarkToFile(mark, number_of_rows); } @@ -273,8 +271,7 @@ void MergeTreeDataPartWriterWide::flushMarkToFile(const StreamNameAndMark & stre StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path) + WrittenOffsetColumns & offset_columns) { StreamsWithMarks result; data_part->getSerialization(column)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) @@ -299,7 +296,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( stream_with_mark.mark.offset_in_decompressed_block = stream.compressed.offset(); result.push_back(stream_with_mark); - }, path); + }); return result; } @@ -327,7 +324,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( return; column_streams[stream_name]->compressed.nextIfAtEnd(); - }, serialize_settings.path); + }); } /// Column must not be empty. (column.size() !== 0) @@ -365,7 +362,7 @@ void MergeTreeDataPartWriterWide::writeColumn( { if (last_non_written_marks.contains(name)) throw Exception(ErrorCodes::LOGICAL_ERROR, "We have to add new mark for column, but already have non written mark. Current mark {}, total marks {}, offset {}", getCurrentMark(), index_granularity.getMarksCount(), rows_written_in_last_mark); - last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns, serialize_settings.path); + last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns); } writeSingleGranule( @@ -389,7 +386,7 @@ void MergeTreeDataPartWriterWide::writeColumn( } } - serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) @@ -397,7 +394,7 @@ void MergeTreeDataPartWriterWide::writeColumn( String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); offset_columns.insert(stream_name); } - }, serialize_settings.path); + }); } @@ -551,7 +548,7 @@ void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksum } if (write_final_mark) - writeFinalMark(*it, offset_columns, serialize_settings.path); + writeFinalMark(*it, offset_columns); } } @@ -616,10 +613,9 @@ void MergeTreeDataPartWriterWide::finish(bool sync) void MergeTreeDataPartWriterWide::writeFinalMark( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path) + WrittenOffsetColumns & offset_columns) { - writeSingleMark(column, offset_columns, 0, path); + writeSingleMark(column, offset_columns, 0); /// Memoize information about offsets data_part->getSerialization(column)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { @@ -629,7 +625,7 @@ void MergeTreeDataPartWriterWide::writeFinalMark( String stream_name = ISerialization::getFileNameForStream(column, substream_path); offset_columns.insert(stream_name); } - }, path); + }); } static void fillIndexGranularityImpl( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index b82fcd652ae..8292a41b902 100644 --- 
a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -60,8 +60,7 @@ private: /// Take offsets from column and return as MarkInCompressed file with stream name StreamsWithMarks getCurrentMarksForColumn( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path); + WrittenOffsetColumns & offset_columns); /// Write mark to disk using stream and rows count void flushMarkToFile( @@ -72,13 +71,11 @@ private: void writeSingleMark( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - size_t number_of_rows, - ISerialization::SubstreamPath & path); + size_t number_of_rows); void writeFinalMark( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path); + WrittenOffsetColumns & offset_columns); void addStreams( const NameAndTypePair & column, diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index b943c3c8718..6e7a080f418 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -49,22 +49,24 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( column_positions.resize(columns_num); read_only_offsets.resize(columns_num); + auto name_and_type = columns.begin(); for (size_t i = 0; i < columns_num; ++i, ++name_and_type) { - if (name_and_type->isSubcolumn()) + auto column_from_part = getColumnFromPart(*name_and_type); + auto position = data_part->getColumnPosition(column_from_part.getNameInStorage()); + bool is_array = isArray(column_from_part.type); + + if (column_from_part.isSubcolumn()) { auto storage_column_from_part = getColumnFromPart( - {name_and_type->getNameInStorage(), name_and_type->getTypeInStorage()}); + {column_from_part.getNameInStorage(), column_from_part.getTypeInStorage()}); - if (!storage_column_from_part.type->tryGetSubcolumnType(name_and_type->getSubcolumnName())) - continue; + auto subcolumn_name = column_from_part.getSubcolumnName(); + if (!storage_column_from_part.type->hasSubcolumn(subcolumn_name)) + position.reset(); } - - auto column_from_part = getColumnFromPart(*name_and_type); - - auto position = data_part->getColumnPosition(column_from_part.getNameInStorage()); - if (!position && typeid_cast(column_from_part.type.get())) + else if (!position && is_array) { /// If array of Nested column is missing in part, /// we have to read its offsets if they exist. 
@@ -221,7 +223,8 @@ void MergeTreeReaderCompact::readData( auto buffer_getter = [&](const ISerialization::SubstreamPath & substream_path) -> ReadBuffer * { - if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != ISerialization::Substream::ArraySizes)) + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; + if (only_offsets && !is_offsets) return nullptr; return data_buffer; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 0f5cf8de669..fa059506954 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -15,7 +15,6 @@ namespace DB namespace { - using OffsetColumns = std::map; constexpr auto DATA_FILE_EXTENSION = ".bin"; } @@ -291,6 +290,7 @@ void MergeTreeReaderWide::readData( /* seek_to_start = */false, substream_path, streams, name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache); }; + deserialize_settings.continuous_reading = continue_reading; auto & deserialize_state = deserialize_binary_bulk_state_map[name]; diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 7f648054da2..5ffca62d9c3 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -242,7 +242,7 @@ void StorageSystemPartsColumns::processNextStorage( IDataType::forEachSubcolumn([&](const auto & subpath, const auto & name, const auto & data) { /// We count only final subcolumns, which are represented by files on disk - /// and skip intermediate suibcolumns of types Tuple and Nested. + /// and skip intermediate subcolumns of types Tuple and Nested. if (isTuple(data.type) || isNested(data.type)) return; @@ -270,7 +270,7 @@ void StorageSystemPartsColumns::processNextStorage( subcolumn_data_uncompressed_bytes.push_back(size.data_uncompressed); subcolumn_marks_bytes.push_back(size.marks); - }, { serialization, column.type, nullptr, nullptr }); + }, ISerialization::SubstreamData(serialization).withType(column.type)); if (columns_mask[src_index++]) columns[res_index++]->insert(subcolumn_names); From b7eebfc6260a408fb06fb65c67103ce1f4ca1d5c Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 2 Sep 2022 07:47:12 +0000 Subject: [PATCH 04/42] Correctly check if the node is using system path --- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 15 +++++++++ src/Common/ZooKeeper/ZooKeeperCommon.h | 9 ++++++ src/Coordination/KeeperSnapshotManager.cpp | 37 ++++------------------ src/Coordination/KeeperStorage.cpp | 8 ++--- 4 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index b15126f5701..749052cbba3 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -898,4 +898,19 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); } +PathMatchResult matchPath(const std::string_view path, const std::string_view match_to) +{ + using enum PathMatchResult; + + auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); + + if (second_it != match_to.end()) + return NOT_MATCH; + + if (first_it == path.end()) + return EXACT; + + return *first_it == '/' ? 
IS_CHILD : NOT_MATCH; +} + } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 53fabf651fa..9a9700b500b 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -554,4 +554,13 @@ private: ZooKeeperRequestFactory(); }; +enum class PathMatchResult +{ + NOT_MATCH, + EXACT, + IS_CHILD +}; + +PathMatchResult matchPath(std::string_view path, std::string_view match_to); + } diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 1e3f37b617f..fe4050eb685 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -13,8 +13,10 @@ #include #include #include -#include "Coordination/KeeperContext.h" +#include #include +#include + namespace DB { @@ -146,33 +148,6 @@ namespace } } -namespace -{ - -enum class PathMatchResult -{ - NOT_MATCH, - EXACT, - IS_CHILD -}; - -PathMatchResult matchPath(const std::string_view path, const std::string_view match_to) -{ - using enum PathMatchResult; - - auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); - - if (second_it != match_to.end()) - return NOT_MATCH; - - if (first_it == path.end()) - return EXACT; - - return *first_it == '/' ? IS_CHILD : NOT_MATCH; -} - -} - void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context) { writeBinary(static_cast(snapshot.version), out); @@ -217,7 +192,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr const auto & path = it->key; // write only the root system path because of digest - if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD) + if (Coordination::matchPath(path.toView(), keeper_system_path) == Coordination::PathMatchResult::IS_CHILD) { ++it; continue; @@ -365,8 +340,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial KeeperStorage::Node node{}; readNode(node, in, current_version, storage.acl_map); - using enum PathMatchResult; - auto match_result = matchPath(path, keeper_system_path); + using enum Coordination::PathMatchResult; + auto match_result = Coordination::matchPath(path, keeper_system_path); const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path); if (match_result == IS_CHILD) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 397cd2c0c71..1c8bc454c8b 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -879,7 +879,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr path_created += seq_num_str.str(); } - if (path_created.starts_with(keeper_system_path)) + if (Coordination::matchPath(path_created, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. 
Path: {}", keeper_system_path, path_created); @@ -1049,7 +1049,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr std::vector new_deltas; - if (request.path.starts_with(keeper_system_path)) + if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path); @@ -1203,7 +1203,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce std::vector new_deltas; - if (request.path.starts_with(keeper_system_path)) + if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); @@ -1472,7 +1472,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr { Coordination::ZooKeeperSetACLRequest & request = dynamic_cast(*zk_request); - if (request.path.starts_with(keeper_system_path)) + if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); From ad2196155c9dd21958dac0cc355c96be494bef2e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 2 Sep 2022 08:14:06 +0000 Subject: [PATCH 05/42] Add test for system node modification --- src/Coordination/tests/gtest_coordination.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 493e76ee5fc..cdda6fc1f32 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -2141,6 +2141,38 @@ TEST_P(CoordinationTest, TestCurrentApiVersion) EXPECT_EQ(keeper_version, static_cast(current_keeper_api_version)); } +TEST_P(CoordinationTest, TestSystemNodeModify) +{ + using namespace Coordination; + int64_t zxid{0}; + + // On INIT we abort when a system path is modified + keeper_context->server_state = KeeperContext::Phase::RUNNING; + KeeperStorage storage{500, "", keeper_context}; + const auto assert_create = [&](const std::string_view path, const auto expected_code) + { + auto request = std::make_shared(); + request->path = path; + storage.preprocessRequest(request, 0, 0, zxid); + auto responses = storage.processRequest(request, 0, zxid); + ASSERT_FALSE(responses.empty()); + + const auto & response = responses[0]; + ASSERT_EQ(response.response->error, expected_code) << "Unexpected error for path " << path; + + ++zxid; + }; + + assert_create("/keeper", Error::ZBADARGUMENTS); + assert_create("/keeper/with_child", Error::ZBADARGUMENTS); + assert_create(DB::keeper_api_version_path, Error::ZBADARGUMENTS); + + assert_create("/keeper_map", Error::ZOK); + assert_create("/keeper1", Error::ZOK); + assert_create("/keepe", Error::ZOK); + assert_create("/keeper1/test", Error::ZOK); +} + INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ From 464818c142766787854d64fcad41869b71d37460 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 2 Sep 2022 15:05:58 +0000 Subject: [PATCH 06/42] try to fix filling of missed Nested columns with multiple levels --- src/Storages/MergeTree/IMergeTreeReader.cpp | 39 ++++++++--- src/Storages/MergeTree/IMergeTreeReader.h | 2 +- 
.../MergeTree/MergeTreeReaderCompact.cpp | 68 ++++++++++--------- .../MergeTree/MergeTreeReaderCompact.h | 1 + .../MergeTree/MergeTreeReaderInMemory.cpp | 8 +-- 5 files changed, 74 insertions(+), 44 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 36a1e5e4b55..b77975ff9ad 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -204,17 +204,40 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const } } -IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const String & column_name) const +IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const NameAndTypePair & required_column) const { - String table_name = Nested::extractTableName(column_name); + auto get_offset_streams = [](const auto & serialization, const auto & name_in_storage) + { + NameSet offset_streams; + serialization->enumerateStreams([&](const auto & subpath) + { + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto full_name = Nested::concatenateName(name_in_storage, subname); + offset_streams.insert(full_name); + }); + + return offset_streams; + }; + + auto required_name_in_storage = Nested::extractTableName(required_column.getNameInStorage()); + auto required_offset_streams = get_offset_streams(getSerializationInPart(required_column), required_name_in_storage); + for (const auto & part_column : data_part->getColumns()) { - if (typeid_cast(part_column.type.get())) - { - auto position = data_part->getColumnPosition(part_column.getNameInStorage()); - if (position && Nested::extractTableName(part_column.name) == table_name) - return position; - } + auto name_in_storage = Nested::extractTableName(part_column.name); + if (name_in_storage != required_name_in_storage) + continue; + + auto offset_streams = get_offset_streams(data_part->getSerialization(part_column.name), name_in_storage); + + bool has_all_offsets = std::all_of(required_offset_streams.begin(), required_offset_streams.end(), + [&](const auto & stream_name) { return offset_streams.contains(stream_name); }); + + if (has_all_offsets) + return data_part->getColumnPosition(part_column.name); } return {}; diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 453563522a5..fa1bf988fbf 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -91,7 +91,7 @@ protected: MarkRanges all_mark_ranges; using ColumnPosition = std::optional; - ColumnPosition findColumnForOffsets(const String & column_name) const; + ColumnPosition findColumnForOffsets(const NameAndTypePair & column) const; private: /// Alter conversions, which must be applied on fly if required diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 28ec2eff56f..87e0d75a871 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -46,37 +46,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( { try { - size_t columns_num = columns_to_read.size(); - - column_positions.resize(columns_num); - read_only_offsets.resize(columns_num); - - for (size_t i = 0; i < columns_num; ++i) - { - const auto & column_to_read = columns_to_read[i]; - - auto position = 
data_part->getColumnPosition(column_to_read.getNameInStorage()); - bool is_array = isArray(column_to_read.type); - - if (column_to_read.isSubcolumn()) - { - auto storage_column_from_part = getColumnInPart( - {column_to_read.getNameInStorage(), column_to_read.getTypeInStorage()}); - - auto subcolumn_name = column_to_read.getSubcolumnName(); - if (!storage_column_from_part.type->hasSubcolumn(subcolumn_name)) - position.reset(); - } - else if (!position && is_array) - { - /// If array of Nested column is missing in part, - /// we have to read its offsets if they exist. - position = findColumnForOffsets(column_to_read.name); - read_only_offsets[i] = (position != std::nullopt); - } - - column_positions[i] = std::move(position); - } + fillColumnPositions(); /// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data. auto buffer_size = getReadBufferSize(data_part, marks_loader, column_positions, all_mark_ranges); @@ -139,6 +109,42 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( } } +void MergeTreeReaderCompact::fillColumnPositions() +{ + size_t columns_num = columns_to_read.size(); + + column_positions.resize(columns_num); + read_only_offsets.resize(columns_num); + + for (size_t i = 0; i < columns_num; ++i) + { + const auto & column_to_read = columns_to_read[i]; + + auto position = data_part->getColumnPosition(column_to_read.getNameInStorage()); + bool is_array = isArray(column_to_read.type); + + if (column_to_read.isSubcolumn()) + { + auto storage_column_from_part = getColumnInPart( + {column_to_read.getNameInStorage(), column_to_read.getTypeInStorage()}); + + auto subcolumn_name = column_to_read.getSubcolumnName(); + if (!storage_column_from_part.type->hasSubcolumn(subcolumn_name)) + position.reset(); + } + + if (!position && is_array) + { + /// If array of Nested column is missing in part, + /// we have to read its offsets if they exist. + position = findColumnForOffsets(column_to_read); + read_only_offsets[i] = (position != std::nullopt); + } + + column_positions[i] = std::move(position); + } +} + size_t MergeTreeReaderCompact::readRows( size_t from_mark, size_t current_task_last_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) { diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index aa0eb949aa1..4c5f1231063 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -39,6 +39,7 @@ public: private: bool isContinuousReading(size_t mark, size_t column_position); + void fillColumnPositions(); ReadBuffer * data_buffer; CompressedReadBufferBase * compressed_data_buffer; diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 766c28c99b9..568e1cecf02 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -32,13 +32,13 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory( {}) , part_in_memory(std::move(data_part_)) { - for (const auto & [name, type] : columns_to_read) + for (const auto & column_to_read : columns_to_read) { /// If array of Nested column is missing in part, /// we have to read its offsets if they exist. 
- if (!part_in_memory->block.has(name) && typeid_cast(type.get())) - if (auto offset_position = findColumnForOffsets(name)) - positions_for_offsets[name] = *offset_position; + if (!part_in_memory->block.has(column_to_read.name) && typeid_cast(column_to_read.type.get())) + if (auto offset_position = findColumnForOffsets(column_to_read)) + positions_for_offsets[column_to_read.name] = *offset_position; } } From dd776eb3d58c239a4b8c6db5817576c0ac446ca5 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 2 Sep 2022 16:18:50 +0000 Subject: [PATCH 07/42] fix enumerateStreams --- src/DataTypes/Serializations/SerializationLowCardinality.cpp | 5 +++-- src/DataTypes/Serializations/SerializationNullable.cpp | 5 +++-- src/DataTypes/Serializations/SerializationSparse.cpp | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 528cd13ddf3..dfe0188c8e7 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -48,12 +48,13 @@ void SerializationLowCardinality::enumerateStreams( const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr; settings.path.push_back(Substream::DictionaryKeys); - settings.path.back().data = SubstreamData(dict_inner_serialization) + auto dict_data = SubstreamData(dict_inner_serialization) .withType(data.type ? dictionary_type : nullptr) .withColumn(column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr) .withSerializationInfo(data.serialization_info); - dict_inner_serialization->enumerateStreams(settings, callback, settings.path.back().data); + settings.path.back().data = dict_data; + dict_inner_serialization->enumerateStreams(settings, callback, dict_data); settings.path.back() = Substream::DictionaryIndexes; settings.path.back().data = data; diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index 47780f67800..560b73bc827 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -48,15 +48,16 @@ void SerializationNullable::enumerateStreams( auto null_map_serialization = std::make_shared(std::make_shared>(), "null", false); settings.path.push_back(Substream::NullMap); - settings.path.back().data = SubstreamData(null_map_serialization) + auto null_map_data = SubstreamData(null_map_serialization) .withType(type_nullable ? std::make_shared() : nullptr) .withColumn(column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr) .withSerializationInfo(data.serialization_info); + settings.path.back().data = null_map_data; callback(settings.path); settings.path.back() = Substream::NullableElements; - settings.path.back().creator = std::make_shared(settings.path.back().data.column); + settings.path.back().creator = std::make_shared(null_map_data.column); settings.path.back().data = data; auto next_data = SubstreamData(nested) diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index 70f9a03f510..855bdfa1b3e 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -156,15 +156,16 @@ void SerializationSparse::enumerateStreams( size_t column_size = column_sparse ? 
column_sparse->size() : 0; settings.path.push_back(Substream::SparseOffsets); - settings.path.back().data = SubstreamData(std::make_shared>()) + auto offsets_data = SubstreamData(std::make_shared>()) .withType(data.type ? std::make_shared() : nullptr) .withColumn(column_sparse ? column_sparse->getOffsetsPtr() : nullptr) .withSerializationInfo(data.serialization_info); + settings.path.back().data = offsets_data; callback(settings.path); settings.path.back() = Substream::SparseElements; - settings.path.back().creator = std::make_shared(settings.path.back().data.column, column_size); + settings.path.back().creator = std::make_shared(offsets_data.column, column_size); settings.path.back().data = data; auto next_data = SubstreamData(nested) From 25e3bebd9d3cb64d8e42172f6f46ec489261109f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 29 Aug 2022 20:36:13 +0200 Subject: [PATCH 08/42] Rework core collecting on CI (eliminate gcore usage) gcore is a gdb command that internally uses gdb to dump the core. However, with proper configuration of limits (core_dump.size_limit) it should not be required, although some issues are possible: - non-standard kernel.core_pattern - sanitizers So yes, gcore is more "universal" (you don't need to configure any `kernel.core_pattern`), but it is ad-hoc, and it has drawbacks - **it does not work when gdb fails**. For example, gdb may fail with `Dwarf Error: DW_FORM_strx1 found in non-DWO CU` in case of DWARF-5 [1]. [1]: https://github.com/ClickHouse/ClickHouse/pull/40772#issuecomment-1236331323. Let's try to switch to a more native way. Signed-off-by: Azat Khuzhin --- docker/test/fuzzer/run-fuzzer.sh | 23 +++++++++++++++++++++-- docker/test/stress/run.sh | 20 +++++++++++++++++--- tests/ci/ast_fuzzer_check.py | 6 +++++- tests/ci/stress_check.py | 2 +- 4 files changed, 44 insertions(+), 7 deletions(-) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 11ddb0bd2d3..93e38260395 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -1,8 +1,15 @@ #!/bin/bash # shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031 -set -eux +set -x + +# core.COMM.PID-TID +sysctl kernel.core_pattern='core.%e.%p-%P' + +set -e +set -u set -o pipefail + trap "exit" INT TERM # The watchdog is in the separate process group, so we have to kill it separately # if the script terminates earlier. @@ -87,6 +94,19 @@ function configure # TODO figure out which ones are needed cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d + + cat > db/config.d/core.xml < + + + 107374182400 + + + $PWD + +EOL } function watchdog @@ -180,7 +200,6 @@ handle SIGUSR2 nostop noprint pass handle SIG$RTMIN nostop noprint pass info signals continue -gcore backtrace full thread apply all backtrace full info registers diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index f8ecdf1aa21..a28785084f1 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -5,6 +5,9 @@ set -x + +# core.COMM.PID-TID +sysctl kernel.core_pattern='core.%e.%p-%P' + # Thread Fuzzer allows to check more permutations of possible thread scheduling # and find more potential issues. 
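+# Note: a relative kernel.core_pattern (as set above) makes the kernel write core files into the crashed process's working directory, so they can later be collected simply by globbing core.*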
@@ -99,6 +102,19 @@ EOL +EOL + + cat > /etc/clickhouse-server/config.d/core.xml < + + + 107374182400 + + + $PWD + EOL } @@ -155,7 +171,6 @@ handle SIGUSR2 nostop noprint pass handle SIG$RTMIN nostop noprint pass info signals continue -gcore backtrace full thread apply all backtrace full info registers @@ -467,8 +482,7 @@ done clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%'), rowNumberInAllBlocks() LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv -# Core dumps (see gcore) -# Default filename is 'core.PROCESS_ID' +# Core dumps for core in core.*; do pigz $core mv $core.gz /test_output/ diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 9f3ddbe9932..0e41aaf8fba 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -29,7 +29,11 @@ IMAGE_NAME = "clickhouse/fuzzer" def get_run_command(pr_number, sha, download_url, workspace_path, image): return ( - f"docker run --network=host --volume={workspace_path}:/workspace " + f"docker run " + # For sysctl + "--privileged " + "--network=host " + f"--volume={workspace_path}:/workspace " "--cap-add syslog --cap-add sys_admin --cap-add=SYS_PTRACE " f'-e PR_TO_TEST={pr_number} -e SHA_TO_TEST={sha} -e BINARY_URL_TO_DOWNLOAD="{download_url}" ' f"{image}" diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index e644eef3bc8..8f310eaa99d 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -33,7 +33,7 @@ def get_run_command( "docker run --cap-add=SYS_PTRACE " # a static link, don't use S3_URL or S3_DOWNLOAD "-e S3_URL='https://s3.amazonaws.com/clickhouse-datasets' " - # For dmesg + # For dmesg and sysctl "--privileged " f"--volume={build_path}:/package_folder " f"--volume={result_folder}:/test_output " From 7a8b8e7a39a663042a0a00c86694ca59ea76fc59 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 5 Sep 2022 20:47:11 +0000 Subject: [PATCH 09/42] Optimizer setting: read in window order optimization's setting is checked before applying it, not inside the optimization code --- src/Processors/QueryPlan/Optimizations/Optimizations.h | 2 +- .../Optimizations/QueryPlanOptimizationSettings.cpp | 1 + .../Optimizations/QueryPlanOptimizationSettings.h | 3 +++ .../reuseStorageOrderingForWindowFunctions.cpp | 7 +------ 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 904f30e84b0..f45200f3026 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -63,7 +63,7 @@ inline const auto & getOptimizations() {tryMergeExpressions, "mergeExpressions", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownFilter, "pushDownFilter", &QueryPlanOptimizationSettings::filter_push_down}, {tryExecuteFunctionsAfterSorting, "liftUpFunctions", &QueryPlanOptimizationSettings::optimize_plan}, - {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_plan} + {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_read_in_window_order} }}; return optimizations; diff --git 
a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp index 1472fb87a89..f9707b973e4 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp @@ -11,6 +11,7 @@ QueryPlanOptimizationSettings QueryPlanOptimizationSettings::fromSettings(const settings.optimize_plan = from.query_plan_enable_optimizations; settings.max_optimizations_to_apply = from.query_plan_max_optimizations_to_apply; settings.filter_push_down = from.query_plan_filter_push_down; + settings.optimize_read_in_window_order = from.optimize_read_in_window_order; return settings; } diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h index b5a37bf69d6..99e52b60a73 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h @@ -21,6 +21,9 @@ struct QueryPlanOptimizationSettings /// If filter push down optimization is enabled. bool filter_push_down = true; + /// window functions read in order optimization + bool optimize_read_in_window_order = true; + static QueryPlanOptimizationSettings fromSettings(const Settings & from); static QueryPlanOptimizationSettings fromContext(ContextPtr from); }; diff --git a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp index 401774b390e..8377b62c947 100644 --- a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp @@ -61,12 +61,7 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, return 0; } - auto context = read_from_merge_tree->getContext(); - if (!context->getSettings().optimize_read_in_window_order) - { - return 0; - } - + const auto context = read_from_merge_tree->getContext(); const auto & query_info = read_from_merge_tree->getQueryInfo(); const auto * select_query = query_info.query->as(); From f76c1482bd148d5282222eff468bccc350421b3a Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 6 Sep 2022 13:56:32 +0000 Subject: [PATCH 10/42] try to fix filling of missed Nested columns with multiple levels --- src/Interpreters/inplaceBlockConversions.cpp | 179 ++++++++++-------- src/Interpreters/inplaceBlockConversions.h | 2 + src/Storages/MergeTree/IMergeTreeReader.cpp | 37 ++-- src/Storages/MergeTree/IMergeTreeReader.h | 2 + .../MergeTree/MergeTreeReaderCompact.cpp | 2 + .../MergeTree/MergeTreeReaderWide.cpp | 13 ++ src/Storages/StorageMemory.cpp | 2 +- .../0_stateless/01825_type_json_17.reference | 20 ++ .../0_stateless/01825_type_json_17.sql | 32 +++- 9 files changed, 191 insertions(+), 98 deletions(-) diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index e4f1b46fc91..116cd91c7cf 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -188,49 +188,13 @@ ActionsDAGPtr evaluateMissingDefaults( return createExpressions(header, expr_list, save_unneeded_columns, context); } -static bool arrayHasNoElementsRead(const IColumn & column) +static std::unordered_map collectOffsetsColumns( + const NamesAndTypesList & available_columns, 
const Columns & res_columns) { - const auto * column_array = typeid_cast(&column); + std::unordered_map offsets_columns; - if (!column_array) - return false; - - size_t size = column_array->size(); - if (!size) - return false; - - const auto & array_data = column_array->getData(); - if (const auto * nested_array = typeid_cast(&array_data)) - return arrayHasNoElementsRead(*nested_array); - - if (!array_data.empty()) - return false; - - size_t last_offset = column_array->getOffsets()[size - 1]; - return last_offset != 0; -} - -void fillMissingColumns( - Columns & res_columns, - size_t num_rows, - const NamesAndTypesList & requested_columns, - const NamesAndTypesList & available_columns, - StorageMetadataPtr metadata_snapshot) -{ - size_t num_columns = requested_columns.size(); - if (num_columns != res_columns.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Invalid number of columns passed to fillMissingColumns. Expected {}, got {}", - num_columns, res_columns.size()); - - /// For a missing column of a nested data structure we must create not a column of empty - /// arrays, but a column of arrays of correct length. - - /// First, collect offset columns for all arrays in the block. - - std::unordered_map offset_columns; auto available_column = available_columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++available_column) + for (size_t i = 0; i < available_columns.size(); ++i, ++available_column) { if (res_columns[i] == nullptr || isColumnConst(*res_columns[i])) continue; @@ -243,75 +207,122 @@ void fillMissingColumns( if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) return; - auto subname = ISerialization::getSubcolumnNameForStream(subpath); - auto & offsets_column = offset_columns[Nested::concatenateName(name_in_storage, subname)]; + const auto & current_offsets_column = subpath.back().data.column; /// If for some reason multiple offsets columns are present /// for the same nested data structure, choose the one that is not empty. - if (!offsets_column || offsets_column->empty()) - offsets_column = subpath.back().data.column; + if (current_offsets_column && !current_offsets_column->empty()) + { + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto full_name = Nested::concatenateName(name_in_storage, subname); + auto & offsets_column = offsets_columns[full_name]; + if (!offsets_column) + offsets_column = current_offsets_column; + + #ifndef NDEBUG + const auto & offsets_data = assert_cast(*offsets_column).getData(); + const auto & current_offsets_data = assert_cast(*current_offsets_column).getData(); + + if (offsets_data != current_offsets_data) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", + offsets_data.size(), current_offsets_data.size(), full_name); + #endif + } }, available_column->type, res_columns[i]); } + return offsets_columns; +} + +void fillMissingColumns( + Columns & res_columns, + size_t num_rows, + const NamesAndTypesList & requested_columns, + const NamesAndTypesList & available_columns, + const NameSet & partially_read_columns, + StorageMetadataPtr metadata_snapshot) +{ + size_t num_columns = requested_columns.size(); + if (num_columns != res_columns.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Invalid number of columns passed to fillMissingColumns. 
Expected {}, got {}", + num_columns, res_columns.size()); + + /// For a missing column of a nested data structure + /// we must create not a column of empty arrays, + /// but a column of arrays of correct length. + + /// First, collect offset columns for all arrays in the block. + auto offset_columns = collectOffsetsColumns(available_columns, res_columns); + /// Insert default values only for columns without default expressions. auto requested_column = requested_columns.begin(); for (size_t i = 0; i < num_columns; ++i, ++requested_column) { const auto & [name, type] = *requested_column; - if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i])) + if (res_columns[i] && partially_read_columns.contains(name)) res_columns[i] = nullptr; - if (res_columns[i] == nullptr) + if (res_columns[i]) + continue; + + if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) + continue; + + std::vector current_offsets; + size_t num_dimensions = 0; + + if (const auto * array_type = typeid_cast(type.get())) { - if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) - continue; + num_dimensions = getNumberOfDimensions(*array_type); + current_offsets.resize(num_dimensions); - std::vector current_offsets; - bool has_all_offsets = true; + auto serialization = IDataType::getSerialization(*requested_column); + auto name_in_storage = Nested::extractTableName(requested_column->name); - const auto * array_type = typeid_cast(type.get()); - if (array_type) + serialization->enumerateStreams([&](const auto & subpath) { - auto serialization = IDataType::getSerialization(*requested_column); - auto name_in_storage = Nested::extractTableName(requested_column->name); + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; - serialization->enumerateStreams([&](const auto & subpath) + size_t level = ISerialization::getArrayLevel(subpath); + assert(level < num_dimensions); + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto it = offset_columns.find(Nested::concatenateName(name_in_storage, subname)); + if (it != offset_columns.end()) + current_offsets[level] = it->second; + }); + + for (size_t j = 0; j < num_dimensions; ++j) + { + if (!current_offsets[j]) { - if (!has_all_offsets) - return; - - if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) - return; - - auto subname = ISerialization::getSubcolumnNameForStream(subpath); - auto it = offset_columns.find(Nested::concatenateName(name_in_storage, subname)); - if (it != offset_columns.end()) - current_offsets.emplace_back(it->second); - else - has_all_offsets = false; - - }, type); + current_offsets.resize(j); + break; + } } + } - if (array_type && has_all_offsets) - { - assert(!current_offsets.empty()); - auto scalar_type = getBaseTypeOfArray(type); + if (!current_offsets.empty()) + { + size_t num_empty_dimensions = num_dimensions - current_offsets.size(); + auto scalar_type = createArrayOfType(getBaseTypeOfArray(type), num_empty_dimensions); - size_t data_size = assert_cast(*current_offsets.back()).getData().back(); - res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst(); + size_t data_size = assert_cast(*current_offsets.back()).getData().back(); + res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst(); - for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it) - res_columns[i] = ColumnArray::create(res_columns[i], 
*it); - } - else - { - /// We must turn a constant column into a full column because the interpreter could infer - /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. - res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); - } + for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it) + res_columns[i] = ColumnArray::create(res_columns[i], *it); + } + else + { + /// We must turn a constant column into a full column because the interpreter could infer + /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. + res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); } } } diff --git a/src/Interpreters/inplaceBlockConversions.h b/src/Interpreters/inplaceBlockConversions.h index 70187d5aace..bea44bf6db9 100644 --- a/src/Interpreters/inplaceBlockConversions.h +++ b/src/Interpreters/inplaceBlockConversions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -44,6 +45,7 @@ void fillMissingColumns( size_t num_rows, const NamesAndTypesList & requested_columns, const NamesAndTypesList & available_columns, + const NameSet & partially_read_columns, StorageMetadataPtr metadata_snapshot); } diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index b77975ff9ad..73f9cc8b75d 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -65,7 +65,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e try { NamesAndTypesList available_columns(columns_to_read.begin(), columns_to_read.end()); - DB::fillMissingColumns(res_columns, num_rows, requested_columns, available_columns, metadata_snapshot); + DB::fillMissingColumns(res_columns, num_rows, requested_columns, available_columns, partially_read_columns, metadata_snapshot); should_evaluate_missing_defaults = std::any_of( res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); @@ -206,9 +206,9 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const NameAndTypePair & required_column) const { - auto get_offset_streams = [](const auto & serialization, const auto & name_in_storage) + auto get_offsets_streams = [](const auto & serialization, const auto & name_in_storage) { - NameSet offset_streams; + Names offsets_streams; serialization->enumerateStreams([&](const auto & subpath) { if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) @@ -216,31 +216,44 @@ IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const Na auto subname = ISerialization::getSubcolumnNameForStream(subpath); auto full_name = Nested::concatenateName(name_in_storage, subname); - offset_streams.insert(full_name); + offsets_streams.push_back(full_name); }); - return offset_streams; + return offsets_streams; }; auto required_name_in_storage = Nested::extractTableName(required_column.getNameInStorage()); - auto required_offset_streams = get_offset_streams(getSerializationInPart(required_column), required_name_in_storage); + auto required_offsets_streams = get_offsets_streams(getSerializationInPart(required_column), required_name_in_storage); + size_t max_matched_streams = 0; + ColumnPosition position; + + /// Find column that has maximal number of matching + /// offsets columns 
with required_column. for (const auto & part_column : data_part->getColumns()) { auto name_in_storage = Nested::extractTableName(part_column.name); if (name_in_storage != required_name_in_storage) continue; - auto offset_streams = get_offset_streams(data_part->getSerialization(part_column.name), name_in_storage); + auto offsets_streams = get_offsets_streams(data_part->getSerialization(part_column.name), name_in_storage); + NameSet offsets_streams_set(offsets_streams.begin(), offsets_streams.end()); - bool has_all_offsets = std::all_of(required_offset_streams.begin(), required_offset_streams.end(), - [&](const auto & stream_name) { return offset_streams.contains(stream_name); }); + size_t i = 0; + for (; i < required_offsets_streams.size(); ++i) + { + if (!offsets_streams_set.contains(required_offsets_streams[i])) + break; + } - if (has_all_offsets) - return data_part->getColumnPosition(part_column.name); + if (i && (!position || i > max_matched_streams)) + { + max_matched_streams = i; + position = data_part->getColumnPosition(part_column.name); + } } - return {}; + return position; } void IMergeTreeReader::checkNumberOfColumns(size_t num_columns_to_read) const diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index fa1bf988fbf..c2fab33bcf9 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -93,6 +93,8 @@ protected: using ColumnPosition = std::optional; ColumnPosition findColumnForOffsets(const NameAndTypePair & column) const; + NameSet partially_read_columns; + private: /// Alter conversions, which must be applied on fly if required MergeTreeData::AlterConversions alter_conversions; diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 87e0d75a871..440879a9340 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -142,6 +142,8 @@ void MergeTreeReaderCompact::fillColumnPositions() } column_positions[i] = std::move(position); + if (read_only_offsets[i]) + partially_read_columns.insert(column_to_read.name); } } diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index abc3816c386..f1124c674c1 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -159,12 +159,18 @@ void MergeTreeReaderWide::addStreams( const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type) { + bool has_any_stream = false; + bool has_all_streams = true; + ISerialization::StreamCallback callback = [&] (const ISerialization::SubstreamPath & substream_path) { String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); if (streams.contains(stream_name)) + { + has_any_stream = true; return; + } bool data_file_exists = data_part->checksums.files.contains(stream_name + DATA_FILE_EXTENSION); @@ -172,8 +178,12 @@ void MergeTreeReaderWide::addStreams( * It is necessary since it allows to add new column to structure of the table without creating new files for old parts. 
*/ if (!data_file_exists) + { + has_all_streams = false; return; + } + has_any_stream = true; bool is_lc_dict = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; streams.emplace(stream_name, std::make_unique( @@ -185,6 +195,9 @@ void MergeTreeReaderWide::addStreams( }; serialization->enumerateStreams(callback); + + if (has_any_stream && !has_all_streams) + partially_read_columns.insert(name_and_type.name); } diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index d8a46b07102..e4dbfe15095 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -95,7 +95,7 @@ protected: ++name_and_type; } - fillMissingColumns(columns, src.rows(), column_names_and_types, column_names_and_types, /*metadata_snapshot=*/ nullptr); + fillMissingColumns(columns, src.rows(), column_names_and_types, column_names_and_types, {}, nullptr); assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; })); return Chunk(std::move(columns), src.rows()); diff --git a/tests/queries/0_stateless/01825_type_json_17.reference b/tests/queries/0_stateless/01825_type_json_17.reference index 96e58224f32..0f97bfed5bc 100644 --- a/tests/queries/0_stateless/01825_type_json_17.reference +++ b/tests/queries/0_stateless/01825_type_json_17.reference @@ -3,5 +3,25 @@ Tuple(arr Nested(k1 Nested(k2 String, k3 String, k4 Int8), k5 Tuple(k6 String)), {"obj":{"arr":[{"k1":[{"k2":"","k3":"ddd","k4":10},{"k2":"","k3":"","k4":20}],"k5":{"k6":"foo"}}],"id":2}} [['bbb','']] [['aaa','ccc']] [['ddd','']] [['','']] +1 [[0,0]] [[10,20]] +Tuple(arr Nested(k1 Nested(k2 String, k3 Nested(k4 Int8))), id Int8) +{"obj":{"arr":[{"k1":[{"k2":"aaa","k3":[]}]}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"bbb","k3":[{"k4":10}]},{"k2":"ccc","k3":[{"k4":20}]}]}],"id":2}} +[['aaa']] [[[]]] +[['bbb','ccc']] [[[10],[20]]] +1 +[[[]]] +[[[10],[20]]] +Tuple(arr Nested(k1 Nested(k2 String, k4 Nested(k5 Int8)), k3 String), id Int8) +{"obj":{"arr":[{"k1":[],"k3":"qqq"},{"k1":[],"k3":"www"}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"aaa","k4":[]}],"k3":"eee"}],"id":2}} +{"obj":{"arr":[{"k1":[{"k2":"bbb","k4":[{"k5":10}]},{"k2":"ccc","k4":[{"k5":20}]}],"k3":"rrr"}],"id":3}} +['qqq','www'] [[],[]] [[],[]] +['eee'] [['aaa']] [[[]]] +['rrr'] [['bbb','ccc']] [[[10],[20]]] +1 +[[],[]] +[[[]]] +[[[10],[20]]] diff --git a/tests/queries/0_stateless/01825_type_json_17.sql b/tests/queries/0_stateless/01825_type_json_17.sql index b34357f8ef1..e3c0c83322b 100644 --- a/tests/queries/0_stateless/01825_type_json_17.sql +++ b/tests/queries/0_stateless/01825_type_json_17.sql @@ -7,12 +7,42 @@ SET output_format_json_named_tuples_as_objects = 1; CREATE TABLE t_json_17(obj JSON) ENGINE = MergeTree ORDER BY tuple(); +DROP FUNCTION IF EXISTS hasValidSizes17; +CREATE FUNCTION hasValidSizes17 AS (arr1, arr2) -> length(arr1) = length(arr2) AND arrayAll((x, y) -> length(x) = length(y), arr1, arr2); + +SYSTEM STOP MERGES t_json_17; + INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], "k5": {"k6": "foo"}}]} SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; SELECT obj.arr.k1.k3, obj.arr.k1.k2 FROM t_json_17 ORDER BY obj.id; +SELECT sum(hasValidSizes17(obj.arr.k1.k3, obj.arr.k1.k2)) == count() FROM t_json_17; SELECT 
obj.arr.k1.k4 FROM t_json_17 ORDER BY obj.id; -DROP TABLE IF EXISTS t_json_17; +TRUNCATE TABLE t_json_17; + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa"}]}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k2": "bbb", "k3": [{"k4": 10}]}, {"k2": "ccc", "k3": [{"k4": 20}]}]}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k1.k2, obj.arr.k1.k3.k4 FROM t_json_17 ORDER BY obj.id; +SELECT sum(hasValidSizes17(obj.arr.k1.k2, obj.arr.k1.k3.k4)) == count() FROM t_json_17; +SELECT obj.arr.k1.k3.k4 FROM t_json_17 ORDER BY obj.id; + +TRUNCATE TABLE t_json_17; + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k3": "qqq"}, {"k3": "www"}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k2": "aaa"}], "k3": "eee"}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 3, "arr": [{"k1": [{"k2": "bbb", "k4": [{"k5": 10}]}, {"k2": "ccc", "k4": [{"k5": 20}]}], "k3": "rrr"}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k3, obj.arr.k1.k2, obj.arr.k1.k4.k5 FROM t_json_17 ORDER BY obj.id; +SELECT sum(hasValidSizes17(obj.arr.k1.k2, obj.arr.k1.k4.k5)) == count() FROM t_json_17; +SELECT obj.arr.k1.k4.k5 FROM t_json_17 ORDER BY obj.id; + +DROP FUNCTION hasValidSizes17; +DROP TABLE t_json_17; From ce9b76f41649269cc05a680fbb26f1bf690d520f Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 6 Sep 2022 15:01:47 +0000 Subject: [PATCH 11/42] fix Nested in in-memory parts --- src/Storages/MergeTree/MergeTreeReaderInMemory.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 568e1cecf02..06f5785c868 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -37,8 +37,13 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory( /// If array of Nested column is missing in part, /// we have to read its offsets if they exist. if (!part_in_memory->block.has(column_to_read.name) && typeid_cast(column_to_read.type.get())) - if (auto offset_position = findColumnForOffsets(column_to_read)) - positions_for_offsets[column_to_read.name] = *offset_position; + { + if (auto offsets_position = findColumnForOffsets(column_to_read)) + { + positions_for_offsets[column_to_read.name] = *offsets_position; + partially_read_columns.insert(column_to_read.name); + } + } } } From 9def86779508bc4b086b57a1874825ca755e2293 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 6 Sep 2022 17:38:51 +0000 Subject: [PATCH 12/42] fix reading of subcolumns from in-memory parts --- src/Storages/MergeTree/MergeTreeReaderInMemory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 06f5785c868..ad425a10d30 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -36,7 +36,8 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory( { /// If array of Nested column is missing in part, /// we have to read its offsets if they exist. 
- if (!part_in_memory->block.has(column_to_read.name) && typeid_cast(column_to_read.type.get())) + if (typeid_cast(column_to_read.type.get()) + && !tryGetColumnFromBlock(part_in_memory->block, column_to_read)) { if (auto offsets_position = findColumnForOffsets(column_to_read)) { From 478d2ad06a6c12e5bf5f87505be382da9844ef48 Mon Sep 17 00:00:00 2001 From: jthmath Date: Wed, 7 Sep 2022 10:23:52 +0800 Subject: [PATCH 13/42] fix ignore projection --- src/Storages/MergeTree/MutateTask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index d0b39684ba2..254bcd9f7f9 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1302,7 +1302,7 @@ private: *ctx->source_part->data_part_storage, it->name(), destination); hardlinked_files.insert(it->name()); } - else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir + else if (!endsWith(it->name(), ".tmp_proj")) // ignore projection tmp merge dir { // it's a projection part directory ctx->data_part_storage_builder->createProjection(destination); From 9c847ceec9efd890f594b5b85f5bb29128e7c265 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 7 Sep 2022 01:58:03 +0200 Subject: [PATCH 14/42] No hardlinks while making backup of MergeTree in atomic database. --- src/Backups/BackupEntryWrappedWith.h | 28 ++++++++++++++ .../MergeTree/DataPartStorageOnDisk.cpp | 37 +++++++++++++------ .../MergeTree/DataPartStorageOnDisk.h | 5 ++- src/Storages/MergeTree/IDataPartStorage.h | 5 ++- src/Storages/MergeTree/MergeTreeData.cpp | 26 +++++++++++-- src/Storages/MergeTree/MergeTreeData.h | 2 +- .../test_concurrency.py | 6 ++- 7 files changed, 87 insertions(+), 22 deletions(-) create mode 100644 src/Backups/BackupEntryWrappedWith.h diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h new file mode 100644 index 00000000000..893a88db9fd --- /dev/null +++ b/src/Backups/BackupEntryWrappedWith.h @@ -0,0 +1,28 @@ +#pragma once + + +namespace DB +{ + +/// Wraps another backup entry and a value of any type. 
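+/// The wrapped value itself is never read; it is stored only to keep some owning object (for example, a data part) alive for as long as the backup entry is in use.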
+template +class BackupEntryWrappedWith : public IBackupEntry +{ +public: + BackupEntryWrappedWith(BackupEntryPtr entry_, const T & custom_value_) : entry(entry_), custom_value(custom_value_) { } + BackupEntryWrappedWith(BackupEntryPtr entry_, T && custom_value_) : entry(entry_), custom_value(std::move(custom_value_)) { } + ~BackupEntryWrappedWith() override = default; + + UInt64 getSize() const override { return entry->getSize(); } + std::optional getChecksum() const override { return entry->getChecksum(); } + std::unique_ptr getReadBuffer() const override { return entry->getReadBuffer(); } + String getFilePath() const override { return entry->getFilePath(); } + DiskPtr tryGetDiskIfExists() const override { return entry->tryGetDiskIfExists(); } + DataSourceDescription getDataSourceDescription() const override { return entry->getDataSourceDescription(); } + +private: + BackupEntryPtr entry; + T custom_value; +}; + +} diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp index 0154fd6e281..894eec12f0c 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -650,23 +650,31 @@ bool DataPartStorageOnDisk::shallParticipateInMerges(const IStoragePolicy & stor } void DataPartStorageOnDisk::backup( - TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries) const + BackupEntries & backup_entries, + bool make_temporary_hard_links, + TemporaryFilesOnDisks * temp_dirs) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; fs::path part_path_in_backup = fs::path{path_in_backup} / part_dir; auto disk = volume->getDisk(); - auto temp_dir_it = temp_dirs.find(disk); - if (temp_dir_it == temp_dirs.end()) - temp_dir_it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp/")).first; - auto temp_dir_owner = temp_dir_it->second; - fs::path temp_dir = temp_dir_owner->getPath(); - fs::path temp_part_dir = temp_dir / part_path_in_backup.relative_path(); - disk->createDirectories(temp_part_dir); + + fs::path temp_part_dir; + std::shared_ptr temp_dir_owner; + if (make_temporary_hard_links) + { + assert(temp_dirs); + auto temp_dir_it = temp_dirs->find(disk); + if (temp_dir_it == temp_dirs->end()) + temp_dir_it = temp_dirs->emplace(disk, std::make_shared(disk, "tmp/")).first; + temp_dir_owner = temp_dir_it->second; + fs::path temp_dir = temp_dir_owner->getPath(); + temp_part_dir = temp_dir / part_path_in_backup.relative_path(); + disk->createDirectories(temp_part_dir); + } /// For example, /// part_path_in_backup = /data/test/table/0_1_1_0 @@ -683,13 +691,18 @@ void DataPartStorageOnDisk::backup( continue; /// Skip *.proj files - they're actually directories and will be handled. 
String filepath_on_disk = part_path_on_disk / filepath; String filepath_in_backup = part_path_in_backup / filepath; - String hardlink_filepath = temp_part_dir / filepath; - disk->createHardLink(filepath_on_disk, hardlink_filepath); + if (make_temporary_hard_links) + { + String hardlink_filepath = temp_part_dir / filepath; + disk->createHardLink(filepath_on_disk, hardlink_filepath); + filepath_on_disk = hardlink_filepath; + } + UInt128 file_hash{checksum.file_hash.first, checksum.file_hash.second}; backup_entries.emplace_back( filepath_in_backup, - std::make_unique(disk, hardlink_filepath, checksum.file_size, file_hash, temp_dir_owner)); + std::make_unique(disk, filepath_on_disk, checksum.file_size, file_hash, temp_dir_owner)); } for (const auto & filepath : files_without_checksums) diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h index 5b5cff8e636..f02ef26f811 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.h +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -89,11 +89,12 @@ public: bool shallParticipateInMerges(const IStoragePolicy &) const override; void backup( - TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries) const override; + BackupEntries & backup_entries, + bool make_temporary_hard_links, + TemporaryFilesOnDisks * temp_dirs) const override; DataPartStoragePtr freeze( const std::string & to, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 946c1c5fd47..9da8a5eae03 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -177,11 +177,12 @@ public: /// Also creates a new tmp_dir for internal disk (if disk is mentioned the first time). using TemporaryFilesOnDisks = std::map>; virtual void backup( - TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries) const = 0; + BackupEntries & backup_entries, + bool make_temporary_hard_links, + TemporaryFilesOnDisks * temp_dirs) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. /// Callback is called after hardlinks are created, but before 'delete-on-destroy.txt' marker is removed. diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index b7b68367e98..8b077e6fb13 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -4110,26 +4111,43 @@ void MergeTreeData::backupData(BackupEntriesCollector & backup_entries_collector backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup)); } -BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup) +BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup) const { BackupEntries backup_entries; std::map> temp_dirs; + /// Tables in atomic databases have UUID. When using atomic database we don't have to create hard links to make a backup, we can just + /// keep smart pointers to data parts instead. 
That's because the files of a data part are removed only by the destructor of the data part + /// and so keeping a smart pointer to a data part is enough to protect those files from deleting. + bool use_hard_links = !getStorageID().hasUUID(); + for (const auto & part : data_parts) { + BackupEntries new_backup_entries; + part->data_part_storage->backup( - temp_dirs, part->checksums, part->getFileNamesWithoutChecksums(), data_path_in_backup, backup_entries); + part->checksums, part->getFileNamesWithoutChecksums(), data_path_in_backup, new_backup_entries, use_hard_links, &temp_dirs); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) { projection_part->data_part_storage->backup( - temp_dirs, projection_part->checksums, projection_part->getFileNamesWithoutChecksums(), fs::path{data_path_in_backup} / part->name, - backup_entries); + new_backup_entries, + use_hard_links, + &temp_dirs); } + + if (!use_hard_links) + { + /// Wrap backup entries with data parts in order to keep the data parts alive while the backup entries in use. + for (auto & [_, backup_entry] : new_backup_entries) + backup_entry = std::make_shared>(std::move(backup_entry), part); + } + + insertAtEnd(backup_entries, std::move(new_backup_entries)); } return backup_entries; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index c91c7ba02a8..93f9e6157d8 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1243,7 +1243,7 @@ protected: bool movePartsToSpace(const DataPartsVector & parts, SpacePtr space); /// Makes backup entries to backup the parts of this table. - static BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup); + BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup) const; class RestoredPartsHolder; diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 2269ccda828..315c6b94507 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -175,7 +175,11 @@ def test_concurrent_backups_on_different_nodes(): @pytest.mark.parametrize( "db_engine, table_engine", - [("Replicated", "ReplicatedMergeTree"), ("Ordinary", "MergeTree")], + [ + ("Ordinary", "MergeTree"), + ("Atomic", "MergeTree"), + ("Replicated", "ReplicatedMergeTree"), + ], ) def test_create_or_drop_tables_during_backup(db_engine, table_engine): if db_engine == "Replicated": From 0cc6202706f5d5f4fe84341b717da33fc9b18cdb Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 5 Sep 2022 15:07:11 +0200 Subject: [PATCH 15/42] Add macos builds to ReleaseBranchCI --- .github/workflows/release_branches.yml | 134 +++++++++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 6403d00157f..1680798060c 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -426,6 +426,100 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwin: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwinAarch64: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin_aarch64 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" 
############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -505,6 +599,46 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" + BuilderSpecialReport: + needs: + - BuilderBinDarwin + - BuilderBinDarwinAarch64 + runs-on: [self-hosted, style-checker] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse special build check + NEEDS_DATA_PATH=${{runner.temp}}/needs.json + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Report Builder + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cat > "$NEEDS_DATA_PATH" << 'EOF' + ${{ toJSON(needs) }} + EOF + cd "$GITHUB_WORKSPACE/tests/ci" + python3 build_report_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" ############################################################################################## ########################### FUNCTIONAl STATELESS TESTS ####################################### ############################################################################################## From f3cf1069dc51457fc8e749b36b8a2e205f5f54b0 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 5 Sep 2022 15:59:20 +0200 Subject: [PATCH 16/42] Fix header for autogenerated version --- tests/ci/version_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/version_helper.py b/tests/ci/version_helper.py index de98b8431de..966858c0747 100755 --- a/tests/ci/version_helper.py +++ b/tests/ci/version_helper.py @@ -20,7 +20,7 @@ const char * auto_contributors[] {{ VERSIONS = Dict[str, Union[int, str]] -VERSIONS_TEMPLATE = """# This variables autochanged by release_lib.sh: +VERSIONS_TEMPLATE = """# This variables autochanged by tests/ci/version_helper.py: # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. From 257c4328f7b4f2a09d40d7c6c05a4170aba7ba28 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 7 Sep 2022 15:06:44 +0200 Subject: [PATCH 17/42] Rename get_build_urls to read_build_urls --- tests/ci/ast_fuzzer_check.py | 4 ++-- tests/ci/build_download_helper.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 9f3ddbe9932..f3939dc89ad 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -17,7 +17,7 @@ from env_helper import ( from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo -from build_download_helper import get_build_name_for_check, get_build_urls +from build_download_helper import get_build_name_for_check, read_build_urls from docker_pull_helper import get_image_with_version from commit_status_helper import post_commit_status from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse @@ -69,7 +69,7 @@ if __name__ == "__main__": build_name = get_build_name_for_check(check_name) print(build_name) - urls = get_build_urls(build_name, reports_path) + urls = read_build_urls(build_name, reports_path) if not urls: raise Exception("No build URLs found") diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index f5eb72dddee..fa6097cd680 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -45,7 +45,7 @@ def get_build_name_for_check(check_name): return CI_CONFIG["tests_config"][check_name]["required_build"] -def get_build_urls(build_name, reports_path): +def read_build_urls(build_name, reports_path): for root, _, files in os.walk(reports_path): for f in files: if build_name in f: @@ -111,7 +111,7 @@ def download_builds_filter( check_name, reports_path, result_path, filter_fn=lambda _: True ): build_name = get_build_name_for_check(check_name) - urls = get_build_urls(build_name, reports_path) + urls = read_build_urls(build_name, reports_path) print(urls) if not urls: From 7cdd8c3304155cca20699a6d69cb7498efff6c4f Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 7 Sep 2022 17:10:58 +0200 Subject: [PATCH 18/42] Move download template to env_helper --- tests/ci/env_helper.py | 8 ++++++-- tests/ci/push_to_artifactory.py | 21 +++++---------------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 12c21398781..a18f47497fd 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -22,10 +22,14 @@ IMAGES_PATH = os.getenv("IMAGES_PATH", TEMP_PATH) REPORTS_PATH = os.getenv("REPORTS_PATH", p.abspath(p.join(module_dir, "./reports"))) REPO_COPY = os.getenv("REPO_COPY", git_root) RUNNER_TEMP = os.getenv("RUNNER_TEMP", p.abspath(p.join(module_dir, "./tmp"))) -S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") -S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "clickhouse-builds") S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "clickhouse-test-reports") +S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") +S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) +S3_ARTIFACT_DOWNLOAD_TEMPLATE = ( + f"{S3_DOWNLOAD}/{S3_BUILDS_BUCKET}/" + "{pr_or_release}/{commit}/{build_name}/{artifact}" +) # These parameters are set only on demand, and only once _GITHUB_JOB_ID = "" diff --git a/tests/ci/push_to_artifactory.py b/tests/ci/push_to_artifactory.py index 6b407eb5bd8..c472bcd6b4a 100755 --- a/tests/ci/push_to_artifactory.py +++ b/tests/ci/push_to_artifactory.py @@ -9,7 +9,7 @@ from typing import Dict, List, Tuple from artifactory import ArtifactorySaaSPath # type: ignore from build_download_helper import dowload_build_with_progress -from env_helper import RUNNER_TEMP, S3_BUILDS_BUCKET, S3_DOWNLOAD +from env_helper import S3_ARTIFACT_DOWNLOAD_TEMPLATE, RUNNER_TEMP from git_helper import TAG_REGEXP, commit, removeprefix, removesuffix @@ -97,18 +97,6 @@ class Packages: class S3: - template = ( - f"{S3_DOWNLOAD}/" - # "clickhouse-builds/" - f"{S3_BUILDS_BUCKET}/" - # "33333/" or "21.11/" from --release, if pull request is omitted - "{pr}/" - # "2bef313f75e4cacc6ea2ef2133e8849ecf0385ec/" - "{commit}/" - # "package_release/clickhouse-common-static_21.11.5.0_amd64.deb" - "{s3_path_suffix}" - ) - def __init__( self, pr: int, @@ -117,7 +105,7 @@ class S3: force_download: bool, ): self._common = dict( - pr=pr, + pr_or_release=pr, commit=commit, ) self.force_download = force_download @@ -133,8 +121,9 @@ class S3: self.packages.replace_with_fallback(package_file) return - url = self.template.format_map( - {**self._common, "s3_path_suffix": s3_path_suffix} + build_name, artifact = s3_path_suffix.split("/") + url = S3_ARTIFACT_DOWNLOAD_TEMPLATE.format_map( + {**self._common, "build_name": build_name, "artifact": artifact} ) try: dowload_build_with_progress(url, path) From bd83e905d3bf303660fded6a6a15813b37cdb464 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 7 Sep 2022 18:47:47 +0200 Subject: [PATCH 19/42] Fix a typo in download_build_with_progress --- tests/ci/build_download_helper.py | 4 ++-- tests/ci/push_to_artifactory.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index fa6097cd680..04d2cda2465 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -56,7 +56,7 @@ def read_build_urls(build_name, reports_path): return [] -def dowload_build_with_progress(url, path): +def download_build_with_progress(url, path): logging.info("Downloading from %s to temp path %s", url, path) for i in range(DOWNLOAD_RETRIES_COUNT): try: @@ -104,7 +104,7 @@ def download_builds(result_path, build_urls, filter_fn): if filter_fn(url): fname = os.path.basename(url.replace("%2B", "+").replace("%20", " ")) logging.info("Will download %s to %s", fname, result_path) - dowload_build_with_progress(url, os.path.join(result_path, fname)) + download_build_with_progress(url, os.path.join(result_path, fname)) def download_builds_filter( diff --git a/tests/ci/push_to_artifactory.py b/tests/ci/push_to_artifactory.py index c472bcd6b4a..dd8081227bf 100755 --- a/tests/ci/push_to_artifactory.py +++ b/tests/ci/push_to_artifactory.py @@ -8,7 +8,7 @@ from collections import namedtuple from typing import Dict, List, Tuple from artifactory import ArtifactorySaaSPath # type: ignore -from build_download_helper import dowload_build_with_progress +from build_download_helper import download_build_with_progress from env_helper import S3_ARTIFACT_DOWNLOAD_TEMPLATE, RUNNER_TEMP from git_helper import TAG_REGEXP, commit, removeprefix, removesuffix @@ -126,14 +126,14 @@ class S3: {**self._common, "build_name": build_name, "artifact": artifact} ) try: - dowload_build_with_progress(url, path) + download_build_with_progress(url, path) except Exception as e: if "Cannot download dataset from" in e.args[0]: new_url = Packages.fallback_to_all(url) logging.warning( "Fallback downloading %s for old release", fallback_path ) - dowload_build_with_progress(new_url, fallback_path) + download_build_with_progress(new_url, fallback_path) self.packages.replace_with_fallback(package_file) def download_deb(self): From a067907fbe0934c18de20a73f48f7c975bcb2006 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 7 Sep 2022 19:20:22 +0200 Subject: [PATCH 20/42] Add typing and order import --- tests/ci/build_download_helper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 04d2cda2465..58997bed253 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -import os import json import logging +import os import sys import time -from typing import Optional +from typing import List, Optional import requests # type: ignore @@ -41,11 +41,11 @@ def get_with_retries( return response -def get_build_name_for_check(check_name): +def get_build_name_for_check(check_name) -> str: return CI_CONFIG["tests_config"][check_name]["required_build"] -def read_build_urls(build_name, reports_path): +def read_build_urls(build_name, reports_path) -> List[str]: for root, _, files in os.walk(reports_path): for f in files: if build_name in f: From c0ce4c2d6fa19b1515f964001ff2cbaf6af13b84 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 7 Sep 2022 19:20:58 +0200 Subject: [PATCH 21/42] Add macos binaries to GH release assets --- .github/workflows/release.yml | 4 ++ tests/ci/download_binary.py | 79 +++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100755 tests/ci/download_binary.py diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7e12695990c..ae905aa62ba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -29,8 +29,12 @@ jobs: rm -rf "$TEMP_PATH" && mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" cd "$REPO_COPY" + # Download and push packages to artifactory python3 ./tests/ci/push_to_artifactory.py --release "${{ github.ref }}" \ --commit '${{ github.sha }}' --artifactory-url "${{ secrets.JFROG_ARTIFACTORY_URL }}" --all + # Download macos binaries to ${{runner.temp}}/download_binary + python3 ./tests/ci/download_binary.py binary_darwin binary_darwin_aarch64 + mv '${{runner.temp}}/download_binary/'clickhouse-* '${{runner.temp}}/push_to_artifactory' - name: Upload packages to release assets uses: svenstaro/upload-release-action@v2 with: diff --git a/tests/ci/download_binary.py b/tests/ci/download_binary.py new file mode 100755 index 00000000000..b95c86aa0bd --- /dev/null +++ b/tests/ci/download_binary.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +""" +This file is needed to avoid cicle import build_download_helper.py <=> env_helper.py +""" + +import argparse +import logging +import os + +from build_download_helper import download_build_with_progress +from ci_config import CI_CONFIG, BuildConfig +from env_helper import RUNNER_TEMP, S3_ARTIFACT_DOWNLOAD_TEMPLATE +from git_helper import Git, commit +from version_helper import get_version_from_repo, version_arg + +TEMP_PATH = os.path.join(RUNNER_TEMP, "download_binary") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Script to download binary artifacts from S3. 
Downloaded artifacts " + "are renamed to clickhouse-{static_binary_name}", + ) + parser.add_argument( + "--version", + type=version_arg, + default=get_version_from_repo().string, + help="a version to generate a download url, get from the repo by default", + ) + parser.add_argument( + "--commit", + type=commit, + default=Git(True).sha, + help="a version to generate a download url, get from the repo by default", + ) + parser.add_argument("--rename", default=True, help=argparse.SUPPRESS) + parser.add_argument( + "--no-rename", + dest="rename", + action="store_false", + default=argparse.SUPPRESS, + help="if set, the downloaded binary won't be renamed to " + "clickhouse-{static_binary_name}, makes sense only for a single build name", + ) + parser.add_argument( + "build_names", + nargs="+", + help="the build names to download", + ) + args = parser.parse_args() + if not args.rename and len(args.build_names) > 1: + parser.error("`--no-rename` shouldn't be used with more than one build name") + return args + + +def main(): + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") + args = parse_args() + os.makedirs(TEMP_PATH, exist_ok=True) + for build in args.build_names: + # check if it's in CI_CONFIG + config = CI_CONFIG["build_config"][build] # type: BuildConfig + if args.rename: + path = os.path.join(TEMP_PATH, f"clickhouse-{config['static_binary_name']}") + else: + path = os.path.join(TEMP_PATH, "clickhouse") + + url = S3_ARTIFACT_DOWNLOAD_TEMPLATE.format( + pr_or_release=f"{args.version.major}.{args.version.minor}", + commit=args.commit, + build_name=build, + artifact="clickhouse", + ) + download_build_with_progress(url, path) + + +if __name__ == "__main__": + main() From e74bb00e90fbe2f8c891b7adf43e86c1b6a21761 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Wed, 7 Sep 2022 19:11:25 +0000 Subject: [PATCH 22/42] Fix: EXPLAIN PLAN - make settings in SETTING clause effective --- src/Interpreters/IInterpreterUnionOrSelectQuery.h | 2 ++ src/Interpreters/InterpreterExplainQuery.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/IInterpreterUnionOrSelectQuery.h b/src/Interpreters/IInterpreterUnionOrSelectQuery.h index a1c86f9de85..6f893d4703e 100644 --- a/src/Interpreters/IInterpreterUnionOrSelectQuery.h +++ b/src/Interpreters/IInterpreterUnionOrSelectQuery.h @@ -58,6 +58,8 @@ public: /// Add limits from external query. 
void addStorageLimits(const StorageLimitsList & limits); + ContextPtr getContext() const { return context; } + protected: ASTPtr query_ptr; ContextMutablePtr context; diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index 4799970b6a1..746d382198d 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -316,7 +316,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl() interpreter.buildQueryPlan(plan); if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); + plan.optimize(QueryPlanOptimizationSettings::fromContext(interpreter.getContext())); if (settings.json) { @@ -326,7 +326,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl() auto plan_array = std::make_unique(); plan_array->add(std::move(plan_map)); - auto format_settings = getFormatSettings(getContext()); + auto format_settings = getFormatSettings(interpreter.getContext()); format_settings.json.quote_64bit_integers = false; JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; From 5a790b15b46b304a57779a4c197788ee4bbf27ce Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 8 Sep 2022 00:20:11 +0000 Subject: [PATCH 23/42] try to fix filling of missed Nested columns with multiple levels --- src/Interpreters/inplaceBlockConversions.cpp | 23 ++++++++------------ src/Storages/MergeTree/IMergeTreeReader.cpp | 6 ++++- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index 116cd91c7cf..a4791690f4e 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -200,23 +200,19 @@ static std::unordered_map collectOffsetsColumns( continue; auto serialization = IDataType::getSerialization(*available_column); - auto name_in_storage = Nested::extractTableName(available_column->name); - serialization->enumerateStreams([&](const auto & subpath) { if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) return; + auto stream_name = ISerialization::getFileNameForStream(*available_column, subpath); const auto & current_offsets_column = subpath.back().data.column; /// If for some reason multiple offsets columns are present /// for the same nested data structure, choose the one that is not empty. if (current_offsets_column && !current_offsets_column->empty()) { - auto subname = ISerialization::getSubcolumnNameForStream(subpath); - auto full_name = Nested::concatenateName(name_in_storage, subname); - auto & offsets_column = offsets_columns[full_name]; - + auto & offsets_column = offsets_columns[stream_name]; if (!offsets_column) offsets_column = current_offsets_column; @@ -227,7 +223,7 @@ static std::unordered_map collectOffsetsColumns( if (offsets_data != current_offsets_data) throw Exception(ErrorCodes::LOGICAL_ERROR, "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", - offsets_data.size(), current_offsets_data.size(), full_name); + offsets_data.size(), current_offsets_data.size(), stream_name); #endif } }, available_column->type, res_columns[i]); @@ -255,7 +251,7 @@ void fillMissingColumns( /// but a column of arrays of correct length. /// First, collect offset columns for all arrays in the block. 
- auto offset_columns = collectOffsetsColumns(available_columns, res_columns); + auto offsets_columns = collectOffsetsColumns(available_columns, res_columns); /// Insert default values only for columns without default expressions. auto requested_column = requested_columns.begin(); @@ -275,14 +271,13 @@ void fillMissingColumns( std::vector current_offsets; size_t num_dimensions = 0; - if (const auto * array_type = typeid_cast(type.get())) + const auto * array_type = typeid_cast(type.get()); + if (array_type && !offsets_columns.empty()) { num_dimensions = getNumberOfDimensions(*array_type); current_offsets.resize(num_dimensions); auto serialization = IDataType::getSerialization(*requested_column); - auto name_in_storage = Nested::extractTableName(requested_column->name); - serialization->enumerateStreams([&](const auto & subpath) { if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) @@ -291,9 +286,9 @@ void fillMissingColumns( size_t level = ISerialization::getArrayLevel(subpath); assert(level < num_dimensions); - auto subname = ISerialization::getSubcolumnNameForStream(subpath); - auto it = offset_columns.find(Nested::concatenateName(name_in_storage, subname)); - if (it != offset_columns.end()) + auto stream_name = ISerialization::getFileNameForStream(*requested_column, subpath); + auto it = offsets_columns.find(stream_name); + if (it != offsets_columns.end()) current_offsets[level] = it->second; }); diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 73f9cc8b75d..521de3515cf 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -65,7 +65,11 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e try { NamesAndTypesList available_columns(columns_to_read.begin(), columns_to_read.end()); - DB::fillMissingColumns(res_columns, num_rows, requested_columns, available_columns, partially_read_columns, metadata_snapshot); + DB::fillMissingColumns( + res_columns, num_rows, + Nested::convertToSubcolumns(requested_columns), + Nested::convertToSubcolumns(available_columns), + partially_read_columns, metadata_snapshot); should_evaluate_missing_defaults = std::any_of( res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); From 09c0bf29310ec71626d039173d75467d28a77c7a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 8 Sep 2022 08:16:38 +0000 Subject: [PATCH 24/42] Add unit tests for match path --- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 14 +++++++++++++- src/Common/ZooKeeper/tests/gtest_zookeeper.cpp | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 src/Common/ZooKeeper/tests/gtest_zookeeper.cpp diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 749052cbba3..3aba0a0651d 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -898,10 +898,22 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); } -PathMatchResult matchPath(const std::string_view path, const std::string_view match_to) +PathMatchResult matchPath(std::string_view path, std::string_view match_to) { using enum PathMatchResult; + if (match_to == "/") + return path == "/" ? 
EXACT : IS_CHILD; + + const auto clean_path = [](auto & p) + { + if (p.ends_with('/')) + p.remove_suffix(1); + }; + + clean_path(path); + clean_path(match_to); + auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); if (second_it != match_to.end()) diff --git a/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp b/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp new file mode 100644 index 00000000000..5a989e5932f --- /dev/null +++ b/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp @@ -0,0 +1,15 @@ +#include + +#include + +TEST(ZooKeeperTest, TestMatchPath) +{ + using namespace Coordination; + + ASSERT_EQ(matchPath("/path/file", "/path"), PathMatchResult::IS_CHILD); + ASSERT_EQ(matchPath("/path/file", "/path/"), PathMatchResult::IS_CHILD); + ASSERT_EQ(matchPath("/path/file", "/"), PathMatchResult::IS_CHILD); + ASSERT_EQ(matchPath("/", "/"), PathMatchResult::EXACT); + ASSERT_EQ(matchPath("/path", "/path/"), PathMatchResult::EXACT); + ASSERT_EQ(matchPath("/path/", "/path"), PathMatchResult::EXACT); +} From bd10a2195b9730a4131a5c8624804d2a6856bf91 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 8 Sep 2022 10:33:24 +0200 Subject: [PATCH 25/42] Build macos binaries in backport CI, add BuilderSpecialReport to Finish --- .github/workflows/backport_branches.yml | 135 ++++++++++++++++++++++++ .github/workflows/release_branches.yml | 1 + 2 files changed, 136 insertions(+) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index a1086452184..1c51d06f395 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -349,6 +349,100 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwin: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwinAarch64: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + 
REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin_aarch64 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -425,6 +519,46 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" + BuilderSpecialReport: + needs: + - BuilderBinDarwin + - BuilderBinDarwinAarch64 + runs-on: [self-hosted, style-checker] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse special build check + NEEDS_DATA_PATH=${{runner.temp}}/needs.json + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Report Builder + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cat > "$NEEDS_DATA_PATH" << 'EOF' + ${{ toJSON(needs) }} + EOF + cd "$GITHUB_WORKSPACE/tests/ci" + python3 build_report_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" ############################################################################################## ########################### FUNCTIONAl STATELESS TESTS ####################################### ############################################################################################## @@ -592,6 +726,7 @@ jobs: - DockerHubPush - DockerServerImages - BuilderReport + - BuilderSpecialReport - FunctionalStatelessTestAsan - FunctionalStatefulTestDebug - StressTestTsan diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 1680798060c..f579d1fee63 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -1981,6 +1981,7 @@ jobs: - DockerHubPush - DockerServerImages - BuilderReport + - BuilderSpecialReport - FunctionalStatelessTestDebug0 - 
FunctionalStatelessTestDebug1 - FunctionalStatelessTestDebug2 From 122009a2bde416447ef6cbb98ea36c95b47f2b31 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 8 Sep 2022 08:29:31 +0200 Subject: [PATCH 26/42] Use table lock if database is ordinary and zero-copy-replication is enabled. --- src/Backups/BackupEntryWrappedWith.h | 9 +++ src/Storages/MergeTree/MergeTreeData.cpp | 60 ++++++++++++++----- src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/StorageMergeTree.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- .../test_concurrency.py | 9 ++- 6 files changed, 64 insertions(+), 20 deletions(-) diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h index 893a88db9fd..97244650b6b 100644 --- a/src/Backups/BackupEntryWrappedWith.h +++ b/src/Backups/BackupEntryWrappedWith.h @@ -1,5 +1,7 @@ #pragma once +#include + namespace DB { @@ -25,4 +27,11 @@ private: T custom_value; }; +template +void wrapBackupEntriesWith(std::vector> & backup_entries, const T & custom_value) +{ + for (auto & [_, backup_entry] : backup_entries) + backup_entry = std::make_shared>(std::move(backup_entry), custom_value); +} + } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8b077e6fb13..b9deeeaffb7 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4108,25 +4108,49 @@ void MergeTreeData::backupData(BackupEntriesCollector & backup_entries_collector else data_parts = getVisibleDataPartsVector(local_context); - backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup)); + backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup, local_context)); } -BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup) const +BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const ContextPtr & local_context) { BackupEntries backup_entries; std::map> temp_dirs; - - /// Tables in atomic databases have UUID. When using atomic database we don't have to create hard links to make a backup, we can just - /// keep smart pointers to data parts instead. That's because the files of a data part are removed only by the destructor of the data part - /// and so keeping a smart pointer to a data part is enough to protect those files from deleting. - bool use_hard_links = !getStorageID().hasUUID(); + TableLockHolder table_lock; for (const auto & part : data_parts) { - BackupEntries new_backup_entries; + /// Hard links is the default way to ensure that we'll be keeping access to the files of parts. + bool make_temporary_hard_links = true; + bool hold_storage_and_part_ptrs = false; + bool hold_table_lock = false; + if (getStorageID().hasUUID()) + { + /// Tables in atomic databases have UUIDs. When using atomic database we don't have to create hard links to make a backup, + /// we can just hold smart pointers to a storage and to data parts instead. That's enough to protect those files from deleting + /// until the backup is done (see the calls `part.unique()` in grabOldParts() and table.unique() in DatabaseCatalog). + make_temporary_hard_links = false; + hold_storage_and_part_ptrs = true; + } + else if (supportsReplication() && part->data_part_storage->supportZeroCopyReplication() && getSettings()->allow_remote_fs_zero_copy_replication) + { + /// Hard links don't work correctly with zero copy replication. 
+ make_temporary_hard_links = false; + hold_storage_and_part_ptrs = true; + hold_table_lock = true; + } + + if (hold_table_lock && !table_lock) + table_lock = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); + + BackupEntries backup_entries_from_part; part->data_part_storage->backup( - part->checksums, part->getFileNamesWithoutChecksums(), data_path_in_backup, new_backup_entries, use_hard_links, &temp_dirs); + part->checksums, + part->getFileNamesWithoutChecksums(), + data_path_in_backup, + backup_entries_from_part, + make_temporary_hard_links, + &temp_dirs); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) @@ -4135,19 +4159,23 @@ BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, con projection_part->checksums, projection_part->getFileNamesWithoutChecksums(), fs::path{data_path_in_backup} / part->name, - new_backup_entries, - use_hard_links, + backup_entries_from_part, + make_temporary_hard_links, &temp_dirs); } - if (!use_hard_links) + if (hold_storage_and_part_ptrs) { - /// Wrap backup entries with data parts in order to keep the data parts alive while the backup entries in use. - for (auto & [_, backup_entry] : new_backup_entries) - backup_entry = std::make_shared>(std::move(backup_entry), part); + /// Wrap backup entries with smart pointers to data parts and to the storage itself + /// (we'll be holding those smart pointers for as long as we'll be using the backup entries). + auto storage_and_part = std::make_pair(shared_from_this(), part); + if (hold_table_lock) + wrapBackupEntriesWith(backup_entries_from_part, std::make_pair(storage_and_part, table_lock)); + else + wrapBackupEntriesWith(backup_entries_from_part, storage_and_part); } - insertAtEnd(backup_entries, std::move(new_backup_entries)); + insertAtEnd(backup_entries, std::move(backup_entries_from_part)); } return backup_entries; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 93f9e6157d8..824c2b41f00 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -1243,7 +1243,7 @@ protected: bool movePartsToSpace(const DataPartsVector & parts, SpacePtr space); /// Makes backup entries to backup the parts of this table. 
- BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup) const; + BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const ContextPtr & local_context); class RestoredPartsHolder; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 66e570fdc3b..5adc1974257 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1785,7 +1785,7 @@ void StorageMergeTree::backupData(BackupEntriesCollector & backup_entries_collec for (const auto & data_part : data_parts) min_data_version = std::min(min_data_version, data_part->info.getDataVersion()); - backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup)); + backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup, local_context)); backup_entries_collector.addBackupEntries(backupMutations(min_data_version + 1, data_path_in_backup)); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 4be97e01293..d662682d228 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8264,7 +8264,7 @@ void StorageReplicatedMergeTree::backupData( else data_parts = getVisibleDataPartsVector(local_context); - auto backup_entries = backupParts(data_parts, ""); + auto backup_entries = backupParts(data_parts, "", local_context); auto coordination = backup_entries_collector.getBackupCoordination(); String shared_id = getTableSharedID(); diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 315c6b94507..fa6b09a89cf 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -29,7 +29,6 @@ def generate_cluster_def(): main_configs = ["configs/backups_disk.xml", generate_cluster_def()] - user_configs = ["configs/allow_database_types.xml"] nodes = [] @@ -184,6 +183,7 @@ def test_concurrent_backups_on_different_nodes(): def test_create_or_drop_tables_during_backup(db_engine, table_engine): if db_engine == "Replicated": db_engine = "Replicated('/clickhouse/path/','{shard}','{replica}')" + if table_engine.endswith("MergeTree"): table_engine += " ORDER BY tuple()" @@ -219,6 +219,12 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): f"RENAME TABLE {table_name1} TO {table_name2}" ) + def truncate_table(): + while time.time() < end_time: + table_name = f"mydb.tbl{randint(1, num_nodes)}" + node = nodes[randint(0, num_nodes - 1)] + node.query(f"TRUNCATE TABLE IF EXISTS {table_name} NO DELAY") + def make_backup(): ids = [] while time.time() < end_time: @@ -240,6 +246,7 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): futures.append(executor.submit(create_table)) futures.append(executor.submit(drop_table)) futures.append(executor.submit(rename_table)) + futures.append(executor.submit(truncate_table)) for future in futures: future.result() ids = ids_future.result() From fab1b40928ed0ffe6b2e0de120f3612d7e572b29 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Thu, 8 Sep 2022 15:21:59 +0200 Subject: [PATCH 27/42] docs: mention SYNC modifier for DROP and DETACH statements --- docs/en/sql-reference/statements/detach.md | 4 +++- docs/en/sql-reference/statements/drop.md | 10 +++++----- 2 files changed, 8 insertions(+), 6 
deletions(-) diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index c7089b0714d..aa87b1ef613 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -10,7 +10,7 @@ Makes the server "forget" about the existence of a table, a materialized view, o **Syntax** ``` sql -DETACH TABLE|VIEW|DICTIONARY [IF EXISTS] [db.]name [ON CLUSTER cluster] [PERMANENTLY] +DETACH TABLE|VIEW|DICTIONARY [IF EXISTS] [db.]name [ON CLUSTER cluster] [PERMANENTLY] [SYNC] ``` Detaching does not delete the data or metadata of a table, a materialized view or a dictionary. If an entity was not detached `PERMANENTLY`, on the next server launch the server will read the metadata and recall the table/view/dictionary again. If an entity was detached `PERMANENTLY`, there will be no automatic recall. @@ -24,6 +24,8 @@ Note that you can not detach permanently the table which is already detached (te Also you can not [DROP](../../sql-reference/statements/drop#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query. +The `SYNC` modifier executes the action without delay. + **Example** Creating a table: diff --git a/docs/en/sql-reference/statements/drop.md b/docs/en/sql-reference/statements/drop.md index 28d379421f1..8a83a8fae1d 100644 --- a/docs/en/sql-reference/statements/drop.md +++ b/docs/en/sql-reference/statements/drop.md @@ -6,7 +6,7 @@ sidebar_label: DROP # DROP Statements -Deletes existing entity. If the `IF EXISTS` clause is specified, these queries do not return an error if the entity does not exist. +Deletes existing entity. If the `IF EXISTS` clause is specified, these queries do not return an error if the entity does not exist. If the `SYNC` modifier is specified, the entity is dropped without delay. ## DROP DATABASE @@ -15,7 +15,7 @@ Deletes all tables inside the `db` database, then deletes the `db` database itse Syntax: ``` sql -DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] +DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] [SYNC] ``` ## DROP TABLE @@ -25,7 +25,7 @@ Deletes the table. Syntax: ``` sql -DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] +DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] [SYNC] ``` ## DROP DICTIONARY @@ -35,7 +35,7 @@ Deletes the dictionary. Syntax: ``` sql -DROP DICTIONARY [IF EXISTS] [db.]name +DROP DICTIONARY [IF EXISTS] [db.]name [SYNC] ``` ## DROP USER @@ -95,7 +95,7 @@ Deletes a view. 
Views can be deleted by a `DROP TABLE` command as well but `DROP Syntax: ``` sql -DROP VIEW [IF EXISTS] [db.]name [ON CLUSTER cluster] +DROP VIEW [IF EXISTS] [db.]name [ON CLUSTER cluster] [SYNC] ``` ## DROP FUNCTION From 3502718a2c7e842f78490f08c8b0b021f549072f Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 8 Sep 2022 13:52:14 +0000 Subject: [PATCH 28/42] Less complications --- src/Common/ZooKeeper/ZooKeeperCommon.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 3aba0a0651d..4ab93d814df 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -902,17 +902,11 @@ PathMatchResult matchPath(std::string_view path, std::string_view match_to) { using enum PathMatchResult; - if (match_to == "/") - return path == "/" ? EXACT : IS_CHILD; + if (path.ends_with('/')) + path.remove_suffix(1); - const auto clean_path = [](auto & p) - { - if (p.ends_with('/')) - p.remove_suffix(1); - }; - - clean_path(path); - clean_path(match_to); + if (match_to.ends_with('/')) + match_to.remove_suffix(1); auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); From 9acc73d811474b61a164debc02a9b730ffab1eb9 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 8 Sep 2022 17:51:29 +0200 Subject: [PATCH 29/42] Update src/Storages/StorageReplicatedMergeTree.cpp Co-authored-by: Alexander Tokmakov --- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d662682d228..93626f768be 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8264,7 +8264,7 @@ void StorageReplicatedMergeTree::backupData( else data_parts = getVisibleDataPartsVector(local_context); - auto backup_entries = backupParts(data_parts, "", local_context); + auto backup_entries = backupParts(data_parts, /* data_path_in_backup */ "", local_context); auto coordination = backup_entries_collector.getBackupCoordination(); String shared_id = getTableSharedID(); From e48cd2b6f6d70236d6b7b66dc47c1cb03d5d8488 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 8 Sep 2022 18:13:28 +0200 Subject: [PATCH 30/42] Add more test cases. 
--- .../test_concurrency.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index fa6b09a89cf..6882cbff5e2 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -178,12 +178,17 @@ def test_concurrent_backups_on_different_nodes(): ("Ordinary", "MergeTree"), ("Atomic", "MergeTree"), ("Replicated", "ReplicatedMergeTree"), + ("Memory", "MergeTree"), + ("Lazy", "Log") ], ) def test_create_or_drop_tables_during_backup(db_engine, table_engine): if db_engine == "Replicated": db_engine = "Replicated('/clickhouse/path/','{shard}','{replica}')" + if db_engine == "Lazy": + db_engine = "Lazy(20)" + if table_engine.endswith("MergeTree"): table_engine += " ORDER BY tuple()" @@ -193,7 +198,7 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): start_time = time.time() end_time = start_time + 60 - def create_table(): + def create_tables(): while time.time() < end_time: node = nodes[randint(0, num_nodes - 1)] table_name = f"mydb.tbl{randint(1, num_nodes)}" @@ -204,13 +209,13 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): f"INSERT INTO {table_name} SELECT rand32() FROM numbers(10)" ) - def drop_table(): + def drop_tables(): while time.time() < end_time: table_name = f"mydb.tbl{randint(1, num_nodes)}" node = nodes[randint(0, num_nodes - 1)] node.query(f"DROP TABLE IF EXISTS {table_name} NO DELAY") - def rename_table(): + def rename_tables(): while time.time() < end_time: table_name1 = f"mydb.tbl{randint(1, num_nodes)}" table_name2 = f"mydb.tbl{randint(1, num_nodes)}" @@ -219,13 +224,13 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): f"RENAME TABLE {table_name1} TO {table_name2}" ) - def truncate_table(): + def truncate_tables(): while time.time() < end_time: table_name = f"mydb.tbl{randint(1, num_nodes)}" node = nodes[randint(0, num_nodes - 1)] node.query(f"TRUNCATE TABLE IF EXISTS {table_name} NO DELAY") - def make_backup(): + def make_backups(): ids = [] while time.time() < end_time: time.sleep( @@ -241,12 +246,12 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): ids = [] with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: futures = [] - ids_future = executor.submit(make_backup) + ids_future = executor.submit(make_backups) futures.append(ids_future) - futures.append(executor.submit(create_table)) - futures.append(executor.submit(drop_table)) - futures.append(executor.submit(rename_table)) - futures.append(executor.submit(truncate_table)) + futures.append(executor.submit(create_tables)) + futures.append(executor.submit(drop_tables)) + futures.append(executor.submit(rename_tables)) + futures.append(executor.submit(truncate_tables)) for future in futures: future.result() ids = ids_future.result() From 10629a66e582c7a2250b01d7a1e4853223dcd631 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 8 Sep 2022 20:58:51 +0200 Subject: [PATCH 31/42] Fix black. 
--- .../test_backup_restore_on_cluster/test_concurrency.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 6882cbff5e2..dd8b6aa50da 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -179,7 +179,7 @@ def test_concurrent_backups_on_different_nodes(): ("Atomic", "MergeTree"), ("Replicated", "ReplicatedMergeTree"), ("Memory", "MergeTree"), - ("Lazy", "Log") + ("Lazy", "Log"), ], ) def test_create_or_drop_tables_during_backup(db_engine, table_engine): From 18f5b5e5b07dd02be68beca38484e85190568cee Mon Sep 17 00:00:00 2001 From: rfraposa Date: Thu, 8 Sep 2022 16:05:56 -0600 Subject: [PATCH 32/42] Add docs for lightweight deletes --- .../sql-reference/statements/alter/delete.md | 6 +++- docs/en/sql-reference/statements/delete.md | 36 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 docs/en/sql-reference/statements/delete.md diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index 809715b5423..1cd6d466788 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -12,7 +12,11 @@ ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -:::note +:::info +The `ALTER TABLE` command is considered a heavyweight operation that requires the underlying data to be merged before it is deleted. For MergeTree tables, consider using the [`DELETE FROM` query](../delete.md), which performs a lightweight delete and can be considerably faster. +::: + +:::note The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. ::: diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md new file mode 100644 index 00000000000..982e8a0e2da --- /dev/null +++ b/docs/en/sql-reference/statements/delete.md @@ -0,0 +1,36 @@ +--- +slug: /en/sql-reference/statements/delete +sidebar_position: 36 +sidebar_label: DELETE +--- + +# DELETE Statement + +``` sql +DELETE FROM [db.]table [WHERE expr] +``` + +For MergeTree tables, `DELETE FROM` performs a lightweight delete on the given table, which means that the deleted rows are marked as deleted immediately and deleted rows will be filtered out of all subsequent queries. The underlying data is permanently deleted whenever merges occur. + +For example, the following query deletes all rows from the `hits` table where the `Title` column contains the text `hello`: + +```sql +DELETE FROM hits WHERE Title LIKE '%hello%'; +``` + + +:::note +This feature is experimental and requires you to set `allow_experimental_lightweight_delete` to true: + +```sql +SET allow_experimental_lightweight_delete = true; +``` + +::: + +The [traditional way to delete rows](./alter/delete.md) in ClickHouse was to use `ALTER TABLE ... DELETE`, which is still a valid method for deleting rows. However, in most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. 
+ +:::info +Lightweight deletes are asynchronous by default. Set `mutations_sync` equal to 1 to wait for one replica to process the statement, and set `mutations_sync` to 2 to wait for all replicas. +::: + From 927f763cbbb7a7059e9f06a139bd1ef51a0760f9 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 9 Sep 2022 10:12:32 +0200 Subject: [PATCH 33/42] tests: disable zero copy replication to suppress warning in 01650_fetch_patition_with_macro_in_zk_path_long Fixes: #41108 Signed-off-by: Azat Khuzhin --- .../01650_fetch_patition_with_macro_in_zk_path_long.sql | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql b/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql index 4357aa199dc..1dae8e7b383 100644 --- a/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql +++ b/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql @@ -5,13 +5,15 @@ DROP TABLE IF EXISTS restore_01640; CREATE TABLE test_01640(i Int64, d Date, s String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/{shard}/tables/test_01640','{replica}') -PARTITION BY toYYYYMM(d) ORDER BY i; +PARTITION BY toYYYYMM(d) ORDER BY i +SETTINGS allow_remote_fs_zero_copy_replication=0; insert into test_01640 values (1, '2021-01-01','some'); CREATE TABLE restore_01640(i Int64, d Date, s String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/{shard}/tables/restore_01640','{replica}') -PARTITION BY toYYYYMM(d) ORDER BY i; +PARTITION BY toYYYYMM(d) ORDER BY i +SETTINGS allow_remote_fs_zero_copy_replication=0; ALTER TABLE restore_01640 FETCH PARTITION tuple(toYYYYMM(toDate('2021-01-01'))) FROM '/clickhouse/{database}/{shard}/tables/test_01640'; From e3fc78aa5fe50bcf23cf5e24241f4c21da4b381e Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 9 Sep 2022 12:14:42 +0200 Subject: [PATCH 34/42] Fix bad warnings in part fetches --- src/Storages/MergeTree/DataPartsExchange.cpp | 8 ++++---- src/Storages/StorageReplicatedMergeTree.cpp | 15 +++++++++++++++ src/Storages/StorageReplicatedMergeTree.h | 1 + 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index ecb692aaf9b..e10881c1eb3 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -422,8 +422,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart( const auto data_settings = data.getSettings(); - if (data_settings->allow_remote_fs_zero_copy_replication && !try_zero_copy) - LOG_WARNING(log, "Zero copy replication enabled, but trying to fetch part {} without zero copy", part_name); + if (data.canUseZeroCopyReplication() && !try_zero_copy) + LOG_INFO(log, "Zero copy replication enabled, but trying to fetch part {} without zero copy", part_name); /// It should be "tmp-fetch_" and not "tmp_fetch_", because we can fetch part to detached/, /// but detached part name prefix should not contain underscore. 
@@ -479,8 +479,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart( } else { - if (data_settings->allow_remote_fs_zero_copy_replication) - LOG_WARNING(log, "Cannot select any zero-copy disk for {}", part_name); + if (data.canUseZeroCopyReplication()) + LOG_INFO(log, "Cannot select any zero-copy disk for {}", part_name); try_zero_copy = false; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 3491f6c9d4a..46401f08da1 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7336,6 +7336,21 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, Context } +bool StorageReplicatedMergeTree::canUseZeroCopyReplication() const +{ + auto settings_ptr = getSettings(); + if (!settings_ptr->allow_remote_fs_zero_copy_replication) + return false; + + auto disks = getStoragePolicy()->getDisks(); + for (const auto & disk : disks) + { + if (disk->supportZeroCopyReplication()) + return true; + } + return false; +} + void StorageReplicatedMergeTree::checkBrokenDisks() { auto disks = getStoragePolicy()->getDisks(); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 79df4f11490..14def28309b 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -327,6 +327,7 @@ public: static bool removeSharedDetachedPart(DiskPtr disk, const String & path, const String & part_name, const String & table_uuid, const String & zookeeper_name, const String & replica_name, const String & zookeeper_path, ContextPtr local_context); + bool canUseZeroCopyReplication() const; private: std::atomic_bool are_restoring_replica {false}; From afe371776108a5dbf8c9ce13d546ef46921f3c72 Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 9 Sep 2022 08:34:42 -0400 Subject: [PATCH 35/42] move title to frontmatter to allow inclusion in other docs --- docs/en/getting-started/install.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 92873cb5fbf..83561b07ade 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -4,10 +4,9 @@ sidebar_position: 1 keywords: [clickhouse, install, installation, docs] description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. slug: /en/getting-started/install +title: Installation --- -# Installation - ## System Requirements {#system-requirements} ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. From 5bb40e5856ead49d8ccbfc8177255da00f96f88d Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 9 Sep 2022 15:04:32 +0200 Subject: [PATCH 36/42] Change cache setting do_not_evict_index_and_mark_files default to 0. 
(#41139) --- src/Interpreters/Cache/FileCacheSettings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/FileCacheSettings.cpp b/src/Interpreters/Cache/FileCacheSettings.cpp index b08c80f20db..819eeaf4140 100644 --- a/src/Interpreters/Cache/FileCacheSettings.cpp +++ b/src/Interpreters/Cache/FileCacheSettings.cpp @@ -31,7 +31,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false); enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD); - do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true); + do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", false); } } From 50789126a83096f36c81df10fed9ecf5dbee30d0 Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Fri, 9 Sep 2022 12:15:59 -0600 Subject: [PATCH 37/42] Update docs/en/sql-reference/statements/delete.md Co-authored-by: Ivan Blinkov --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 982e8a0e2da..b89be52606f 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -10,7 +10,7 @@ sidebar_label: DELETE DELETE FROM [db.]table [WHERE expr] ``` -For MergeTree tables, `DELETE FROM` performs a lightweight delete on the given table, which means that the deleted rows are marked as deleted immediately and deleted rows will be filtered out of all subsequent queries. The underlying data is permanently deleted whenever merges occur. +`DELETE FROM` removes rows from table `[db.]table` that match expression `expr`. The deleted rows are marked as deleted immediately and will be automatically filtered out of all subsequent queries. Cleanup of data happens asynchronously in the background. This feature is only available for the MergeTree table engine family. For example, the following query deletes all rows from the `hits` table where the `Title` column contains the text `hello`: From 9870957621dcaeeece4297d4db3f4bcb3675a996 Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Fri, 9 Sep 2022 12:16:54 -0600 Subject: [PATCH 38/42] Update docs/en/sql-reference/statements/delete.md Co-authored-by: Ivan Blinkov --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index b89be52606f..07e70d1fd5d 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -28,7 +28,7 @@ SET allow_experimental_lightweight_delete = true; ::: -The [traditional way to delete rows](./alter/delete.md) in ClickHouse was to use `ALTER TABLE ... DELETE`, which is still a valid method for deleting rows. However, in most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. +An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. 
:::info Lightweight deletes are asynchronous by default. Set `mutations_sync` equal to 1 to wait for one replica to process the statement, and set `mutations_sync` to 2 to wait for all replicas. From 9d717d62e141e4db2a33a161c41548e79c2b26cd Mon Sep 17 00:00:00 2001 From: DanRoscigno Date: Fri, 9 Sep 2022 14:56:25 -0400 Subject: [PATCH 39/42] fix slug --- docs/zh/development/tests.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/development/tests.md b/docs/zh/development/tests.md index e6d5cf66de9..6f1118e5e63 100644 --- a/docs/zh/development/tests.md +++ b/docs/zh/development/tests.md @@ -1,5 +1,5 @@ --- -slug: /en/development/tests +slug: /zh/development/tests sidebar_position: 70 sidebar_label: Testing title: ClickHouse Testing From 4af246a2e055772dab215187063131aa10ee37ec Mon Sep 17 00:00:00 2001 From: rfraposa Date: Fri, 9 Sep 2022 13:59:21 -0600 Subject: [PATCH 40/42] Feedback --- docs/en/sql-reference/statements/alter/delete.md | 5 +---- docs/en/sql-reference/statements/delete.md | 5 +++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index 1cd6d466788..ba5d01d9b4d 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -12,12 +12,9 @@ ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -:::info -The `ALTER TABLE` command is considered a heavyweight operation that requires the underlying data to be merged before it is deleted. For MergeTree tables, consider using the [`DELETE FROM` query](../delete.md), which performs a lightweight delete and can be considerably faster. -::: :::note -The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. +The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. `ALTER TABLE` is considered a heavyweight operation that requires the underlying data to be merged before it is deleted. For MergeTree tables, consider using the [`DELETE FROM` query](../delete.md), which performs a lightweight delete and can be considerably faster. ::: The `filter_expr` must be of type `UInt8`. The query deletes rows in the table for which this expression takes a non-zero value. diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 07e70d1fd5d..492c59f525d 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -18,6 +18,7 @@ For example, the following query deletes all rows from the `hits` table where th DELETE FROM hits WHERE Title LIKE '%hello%'; ``` +Lightweight deletes are asynchronous by default. Set `mutations_sync` equal to 1 to wait for one replica to process the statement, and set `mutations_sync` to 2 to wait for all replicas. 
:::note This feature is experimental and requires you to set `allow_experimental_lightweight_delete` to true: @@ -30,7 +31,7 @@ SET allow_experimental_lightweight_delete = true; ::: -An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. +An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. -:::info -Lightweight deletes are asynchronous by default. Set `mutations_sync` equal to 1 to wait for one replica to process the statement, and set `mutations_sync` to 2 to wait for all replicas. +:::warn +Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on an OLTP system. Lightweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. ::: From 726639484ac653d029420cee757e72805a8bbfb1 Mon Sep 17 00:00:00 2001 From: Igor Nikonov <954088+devcrafter@users.noreply.github.com> Date: Fri, 9 Sep 2022 22:15:38 +0200 Subject: [PATCH 41/42] Revert "Query plan optimization setting: read in window order" --- src/Interpreters/IInterpreterUnionOrSelectQuery.h | 2 -- src/Interpreters/InterpreterExplainQuery.cpp | 4 ++-- src/Processors/QueryPlan/Optimizations/Optimizations.h | 2 +- .../Optimizations/QueryPlanOptimizationSettings.cpp | 1 - .../Optimizations/QueryPlanOptimizationSettings.h | 3 --- .../reuseStorageOrderingForWindowFunctions.cpp | 7 ++++++- 6 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/Interpreters/IInterpreterUnionOrSelectQuery.h b/src/Interpreters/IInterpreterUnionOrSelectQuery.h index 6f893d4703e..a1c86f9de85 100644 --- a/src/Interpreters/IInterpreterUnionOrSelectQuery.h +++ b/src/Interpreters/IInterpreterUnionOrSelectQuery.h @@ -58,8 +58,6 @@ public: /// Add limits from external query. 
void addStorageLimits(const StorageLimitsList & limits); - ContextPtr getContext() const { return context; } - protected: ASTPtr query_ptr; ContextMutablePtr context; diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index 746d382198d..4799970b6a1 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -316,7 +316,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl() interpreter.buildQueryPlan(plan); if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(interpreter.getContext())); + plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); if (settings.json) { @@ -326,7 +326,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl() auto plan_array = std::make_unique(); plan_array->add(std::move(plan_map)); - auto format_settings = getFormatSettings(interpreter.getContext()); + auto format_settings = getFormatSettings(getContext()); format_settings.json.quote_64bit_integers = false; JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index f45200f3026..904f30e84b0 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -63,7 +63,7 @@ inline const auto & getOptimizations() {tryMergeExpressions, "mergeExpressions", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownFilter, "pushDownFilter", &QueryPlanOptimizationSettings::filter_push_down}, {tryExecuteFunctionsAfterSorting, "liftUpFunctions", &QueryPlanOptimizationSettings::optimize_plan}, - {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_read_in_window_order} + {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_plan} }}; return optimizations; diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp index f9707b973e4..1472fb87a89 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp @@ -11,7 +11,6 @@ QueryPlanOptimizationSettings QueryPlanOptimizationSettings::fromSettings(const settings.optimize_plan = from.query_plan_enable_optimizations; settings.max_optimizations_to_apply = from.query_plan_max_optimizations_to_apply; settings.filter_push_down = from.query_plan_filter_push_down; - settings.optimize_read_in_window_order = from.optimize_read_in_window_order; return settings; } diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h index 99e52b60a73..b5a37bf69d6 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h @@ -21,9 +21,6 @@ struct QueryPlanOptimizationSettings /// If filter push down optimization is enabled. 
bool filter_push_down = true; - /// window functions read in order optimization - bool optimize_read_in_window_order = true; - static QueryPlanOptimizationSettings fromSettings(const Settings & from); static QueryPlanOptimizationSettings fromContext(ContextPtr from); }; diff --git a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp index 8377b62c947..401774b390e 100644 --- a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp @@ -61,7 +61,12 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, return 0; } - const auto context = read_from_merge_tree->getContext(); + auto context = read_from_merge_tree->getContext(); + if (!context->getSettings().optimize_read_in_window_order) + { + return 0; + } + const auto & query_info = read_from_merge_tree->getQueryInfo(); const auto * select_query = query_info.query->as(); From b9e9d776f0b91291e00d37ab4a4ecb096c4a6dbf Mon Sep 17 00:00:00 2001 From: Rich Raposa Date: Fri, 9 Sep 2022 15:55:57 -0600 Subject: [PATCH 42/42] Update delete.md --- docs/en/sql-reference/statements/delete.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md index 492c59f525d..487dfc87f9a 100644 --- a/docs/en/sql-reference/statements/delete.md +++ b/docs/en/sql-reference/statements/delete.md @@ -31,7 +31,7 @@ An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster. -:::warn +:::warning Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on an OLTP system. Lightweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios. :::
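Taken together, the documentation patches above describe a single workflow for lightweight deletes. The sketch below only illustrates that workflow; the table name, schema, and sample data are hypothetical, and the only statements and settings used are the ones documented in `delete.md` (`allow_experimental_lightweight_delete`, `mutations_sync`, `DELETE FROM ... WHERE`):

```sql
-- Hypothetical MergeTree table, used only to illustrate the documented syntax.
CREATE TABLE hits_example
(
    id UInt64,
    Title String
)
ENGINE = MergeTree
ORDER BY id;

INSERT INTO hits_example VALUES (1, 'hello world'), (2, 'goodbye');

-- Lightweight deletes are experimental and must be enabled explicitly.
SET allow_experimental_lightweight_delete = true;

-- Deletes are asynchronous by default; set mutations_sync = 1 to wait for one
-- replica to process the statement, or 2 to wait for all replicas.
SET mutations_sync = 2;

-- Matching rows are marked as deleted immediately and filtered out of all
-- subsequent queries; the underlying data is cleaned up by later merges.
DELETE FROM hits_example WHERE Title LIKE '%hello%';

SELECT count() FROM hits_example; -- 1 row remains
```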