Bring back optimization for reading subcolumns of single column in Compact parts

2024-12-20 05:05:38 +00:00 · 2024-11-22 15:30:34 +00:00 · 2024-11-22 15:30:34 +00:00 · 28534272c9
commit 28534272c9
parent a2d37aba4d
4 changed files with 37 additions and 13 deletions
--- a/src/DataTypes/Serializations/SerializationObject.cpp
+++ b/src/DataTypes/Serializations/SerializationObject.cpp
@ -9,6 +9,8 @@
 #include <DataTypes/DataTypeString.h>
 #include <IO/ReadBufferFromString.h>

+#include <Common/logger_useful.h>
+
 namespace DB
 {

--- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
@ -148,7 +148,8 @@ void MergeTreeReaderCompact::readData(
    ColumnPtr & column,
    size_t rows_to_read,
    const InputStreamGetter & getter,
-    ISerialization::SubstreamsCache & cache)
+    ISerialization::SubstreamsCache & cache,
+    std::unordered_map<String, ColumnPtr> & columns_cache_for_subcolumns)
 {
    try
    {
@ -171,6 +172,17 @@ void MergeTreeReaderCompact::readData(
            const auto & type_in_storage = name_and_type.getTypeInStorage();
            const auto & name_in_storage = name_and_type.getNameInStorage();

+            if (auto cache_for_subcolumns_it = columns_cache_for_subcolumns.find(name_in_storage); cache_for_subcolumns_it != columns_cache_for_subcolumns.end())
+            {
+                auto subcolumn = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), cache_for_subcolumns_it->second);
+                /// TODO: Avoid extra copying.
+                if (column->empty())
+                    column = IColumn::mutate(subcolumn);
+                else
+                    column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size());
+            }
+            else
+            {
                auto serialization = getSerializationInPart({name_in_storage, type_in_storage});
                ColumnPtr temp_column = type_in_storage->createColumn(*serialization);

@ -182,6 +194,9 @@ void MergeTreeReaderCompact::readData(
                    column = subcolumn;
                else
                    column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size());
+
+                columns_cache_for_subcolumns[name_in_storage] = temp_column;
+            }
        }
        else
        {
--- a/src/Storages/MergeTree/MergeTreeReaderCompact.h
+++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h
@ -45,7 +45,8 @@ protected:
        ColumnPtr & column,
        size_t rows_to_read,
        const InputStreamGetter & getter,
-        ISerialization::SubstreamsCache & cache);
+        ISerialization::SubstreamsCache & cache,
+        std::unordered_map<String, ColumnPtr> & columns_cache_for_subcolumns);

    void readPrefix(
        const NameAndTypePair & name_and_type,
--- a/src/Storages/MergeTree/MergeTreeReaderCompactSingleBuffer.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderCompactSingleBuffer.cpp
@ -29,6 +29,12 @@ try
        /// Use cache to avoid reading the column with the same name twice.
        /// It may happen if there are empty array Nested in the part.
        ISerialization::SubstreamsCache cache;
+        /// If we need to read multiple subcolumns from a single column in storage,
+        /// we will read this column only once and then reuse it to extract all subcolumns.
+        /// We cannot use SubstreamsCache for it, because we may also read the full column itself
+        /// and it might not be empty inside res_columns (and SubstreamsCache contains the whole columns).
+        /// TODO: refactor the code so that we first read all full columns and then extract all subcolumns from them.
+        std::unordered_map<String, ColumnPtr> columns_cache_for_subcolumns;

        for (size_t pos = 0; pos < num_columns; ++pos)
        {
@ -56,7 +62,7 @@ try
            };

            readPrefix(columns_to_read[pos], buffer_getter, buffer_getter_for_prefix, columns_for_offsets[pos]);
-            readData(columns_to_read[pos], column, rows_to_read, buffer_getter, cache);
+            readData(columns_to_read[pos], column, rows_to_read, buffer_getter, cache, columns_cache_for_subcolumns);
        }

        ++from_mark;