diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index 1b95fddee9f..805a11521b3 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -9,6 +9,8 @@ #include #include +#include + namespace DB { diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 7451374070c..06635c39838 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -148,7 +148,8 @@ void MergeTreeReaderCompact::readData( ColumnPtr & column, size_t rows_to_read, const InputStreamGetter & getter, - ISerialization::SubstreamsCache & cache) + ISerialization::SubstreamsCache & cache, + std::unordered_map & columns_cache_for_subcolumns) { try { @@ -171,17 +172,31 @@ void MergeTreeReaderCompact::readData( const auto & type_in_storage = name_and_type.getTypeInStorage(); const auto & name_in_storage = name_and_type.getNameInStorage(); - auto serialization = getSerializationInPart({name_in_storage, type_in_storage}); - ColumnPtr temp_column = type_in_storage->createColumn(*serialization); - - serialization->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, deserialize_binary_bulk_state_map[name], nullptr); - auto subcolumn = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), temp_column); - - /// TODO: Avoid extra copying. - if (column->empty()) - column = subcolumn; + if (auto cache_for_subcolumns_it = columns_cache_for_subcolumns.find(name_in_storage); cache_for_subcolumns_it != columns_cache_for_subcolumns.end()) + { + auto subcolumn = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), cache_for_subcolumns_it->second); + /// TODO: Avoid extra copying. 
+ if (column->empty()) + column = IColumn::mutate(subcolumn); + else + column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size()); + } else - column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size()); + { + auto serialization = getSerializationInPart({name_in_storage, type_in_storage}); + ColumnPtr temp_column = type_in_storage->createColumn(*serialization); + + serialization->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, deserialize_binary_bulk_state_map[name], nullptr); + auto subcolumn = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), temp_column); + + /// TODO: Avoid extra copying. + if (column->empty()) + column = subcolumn; + else + column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size()); + + columns_cache_for_subcolumns[name_in_storage] = temp_column; + } } else { diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index 1c6bd1474e3..f18ee8808ec 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -45,7 +45,8 @@ protected: ColumnPtr & column, size_t rows_to_read, const InputStreamGetter & getter, - ISerialization::SubstreamsCache & cache); + ISerialization::SubstreamsCache & cache, + std::unordered_map & columns_cache_for_subcolumns); void readPrefix( const NameAndTypePair & name_and_type, diff --git a/src/Storages/MergeTree/MergeTreeReaderCompactSingleBuffer.cpp b/src/Storages/MergeTree/MergeTreeReaderCompactSingleBuffer.cpp index 649bcce1188..50224eba82d 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompactSingleBuffer.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompactSingleBuffer.cpp @@ -29,6 +29,12 @@ try /// Use cache to avoid reading the column with the same name twice. /// It may happen if there are empty array Nested in the part. 
ISerialization::SubstreamsCache cache; + /// If we need to read multiple subcolumns from a single column in storage, + /// we will read this column only once and then reuse it to extract all subcolumns. + /// We cannot use SubstreamsCache for it, because we may also read the full column itself + /// and it might not be empty inside res_columns (and SubstreamsCache contains the whole columns). + /// TODO: refactor the code so that we first read all full columns and then extract all subcolumns from them. + std::unordered_map columns_cache_for_subcolumns; for (size_t pos = 0; pos < num_columns; ++pos) { @@ -56,7 +62,7 @@ try }; readPrefix(columns_to_read[pos], buffer_getter, buffer_getter_for_prefix, columns_for_offsets[pos]); - readData(columns_to_read[pos], column, rows_to_read, buffer_getter, cache); + readData(columns_to_read[pos], column, rows_to_read, buffer_getter, cache, columns_cache_for_subcolumns); } ++from_mark;