Bring back optimization for reading subcolumns of single column in Compact parts

This commit is contained in:
avogar 2024-11-22 15:30:34 +00:00
parent a2d37aba4d
commit 28534272c9
4 changed files with 37 additions and 13 deletions

View File

@ -9,6 +9,8 @@
#include <DataTypes/DataTypeString.h>
#include <IO/ReadBufferFromString.h>
#include <Common/logger_useful.h>
namespace DB
{

View File

@ -148,7 +148,8 @@ void MergeTreeReaderCompact::readData(
ColumnPtr & column,
size_t rows_to_read,
const InputStreamGetter & getter,
ISerialization::SubstreamsCache & cache)
ISerialization::SubstreamsCache & cache,
std::unordered_map<String, ColumnPtr> & columns_cache_for_subcolumns)
{
try
{
@ -171,6 +172,17 @@ void MergeTreeReaderCompact::readData(
const auto & type_in_storage = name_and_type.getTypeInStorage();
const auto & name_in_storage = name_and_type.getNameInStorage();
if (auto cache_for_subcolumns_it = columns_cache_for_subcolumns.find(name_in_storage); cache_for_subcolumns_it != columns_cache_for_subcolumns.end())
{
auto subcolumn = type_in_storage->getSubcolumn(name_and_type.getSubcolumnName(), cache_for_subcolumns_it->second);
/// TODO: Avoid extra copying.
if (column->empty())
column = IColumn::mutate(subcolumn);
else
column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size());
}
else
{
auto serialization = getSerializationInPart({name_in_storage, type_in_storage});
ColumnPtr temp_column = type_in_storage->createColumn(*serialization);
@ -182,6 +194,9 @@ void MergeTreeReaderCompact::readData(
column = subcolumn;
else
column->assumeMutable()->insertRangeFrom(*subcolumn, 0, subcolumn->size());
columns_cache_for_subcolumns[name_in_storage] = temp_column;
}
}
else
{

View File

@ -45,7 +45,8 @@ protected:
ColumnPtr & column,
size_t rows_to_read,
const InputStreamGetter & getter,
ISerialization::SubstreamsCache & cache);
ISerialization::SubstreamsCache & cache,
std::unordered_map<String, ColumnPtr> & columns_cache_for_subcolumns);
void readPrefix(
const NameAndTypePair & name_and_type,

View File

@ -29,6 +29,12 @@ try
/// Use cache to avoid reading the column with the same name twice.
/// It may happen if there are empty array Nested in the part.
ISerialization::SubstreamsCache cache;
/// If we need to read multiple subcolumns from a single column in storage,
/// we will read this column only once and then reuse it to extract all subcolumns.
/// We cannot use SubstreamsCache for it, because we may also read the full column itself
/// and it might not be empty inside res_columns (and SubstreamsCache contains the whole columns).
/// TODO: refactor the code in a way when we first read all full columns and then extract all subcolumns from them.
std::unordered_map<String, ColumnPtr> columns_cache_for_subcolumns;
for (size_t pos = 0; pos < num_columns; ++pos)
{
@ -56,7 +62,7 @@ try
};
readPrefix(columns_to_read[pos], buffer_getter, buffer_getter_for_prefix, columns_for_offsets[pos]);
readData(columns_to_read[pos], column, rows_to_read, buffer_getter, cache);
readData(columns_to_read[pos], column, rows_to_read, buffer_getter, cache, columns_cache_for_subcolumns);
}
++from_mark;