fix reading of empty Nested(Array(...))

This commit is contained in:
Anton Popov 2023-08-04 13:10:50 +00:00
parent d43e76b147
commit 991abde851
5 changed files with 62 additions and 21 deletions

View File

@ -242,7 +242,7 @@ IMergeTreeReader::ColumnNameLevel IMergeTreeReader::findColumnForOffsets(const N
/// Find column that has maximal number of matching
/// offsets columns with required_column.
for (const auto & part_column : data_part_info_for_read->getColumns())
for (const auto & part_column : Nested::convertToSubcolumns(data_part_info_for_read->getColumns()))
{
auto name_in_storage = Nested::extractTableName(part_column.name);
if (name_in_storage != required_name_in_storage)

View File

@ -105,10 +105,10 @@ protected:
NameSet partially_read_columns;
private:
/// Alter conversions, which must be applied on fly if required
AlterConversionsPtr alter_conversions;
private:
/// Columns that are requested to read.
NamesAndTypesList requested_columns;

View File

@ -149,11 +149,34 @@ void MergeTreeReaderCompact::fillColumnPositions()
position.reset();
}
/// If array of Nested column is missing in part,
/// we have to read its offsets if they exist.
if (!position && is_array)
{
/// If array of Nested column is missing in part,
/// we have to read its offsets if they exist.
auto name_level_for_offsets = findColumnForOffsets(column_to_read);
NameAndTypePair column_to_read_with_subcolumns = column_to_read;
auto [name_in_storage, subcolumn_name] = Nested::splitName(column_to_read.name);
/// If it is a part of Nested, we need to get the column from
/// storage metadata which is converted to Nested type with subcolumns.
/// It is required for proper counting of shared streams.
if (!subcolumn_name.empty())
{
/// If column is renamed get the new name from storage metadata.
if (alter_conversions->columnHasNewName(name_in_storage))
name_in_storage = alter_conversions->getColumnNewName(name_in_storage);
if (!storage_columns_with_collected_nested)
storage_columns_with_collected_nested = ColumnsDescription(
Nested::collect(metadata_snapshot->getColumns().getAllPhysical()));
column_to_read_with_subcolumns = storage_columns_with_collected_nested
->getColumnOrSubcolumn(
GetColumnsOptions::All,
Nested::concatenateName(name_in_storage, subcolumn_name));
}
auto name_level_for_offsets = findColumnForOffsets(column_to_read_with_subcolumns);
if (name_level_for_offsets.has_value())
{
column_positions[i] = data_part_info_for_read->getColumnPosition(name_level_for_offsets->first);
@ -162,7 +185,9 @@ void MergeTreeReaderCompact::fillColumnPositions()
}
}
else
{
column_positions[i] = std::move(position);
}
}
}
@ -297,6 +322,8 @@ void MergeTreeReaderCompact::readData(
};
ISerialization::DeserializeBinaryBulkStatePtr state;
ISerialization::DeserializeBinaryBulkStatePtr state_for_prefix;
ISerialization::DeserializeBinaryBulkSettings deserialize_settings;
deserialize_settings.avg_value_size_hint = avg_value_size_hints[name];
@ -306,14 +333,18 @@ void MergeTreeReaderCompact::readData(
/// In case of reading only offsets use the correct serialization for reading of the prefix
auto serialization = getSerializationInPart(name_type_in_storage);
auto serialization_for_prefix = column_for_offsets ? getSerializationInPart(*column_for_offsets) : serialization;
ColumnPtr temp_column = name_type_in_storage.type->createColumn(*serialization);
deserialize_settings.getter = buffer_getter_for_prefix;
serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state);
if (column_for_offsets)
{
auto serialization_for_prefix = getSerializationInPart(*column_for_offsets);
deserialize_settings.getter = buffer_getter_for_prefix;
serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state_for_prefix);
}
deserialize_settings.getter = buffer_getter;
serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, state);
serialization->deserializeBinaryBulkWithMultipleStreams(temp_column, rows_to_read, deserialize_settings, state, nullptr);
auto subcolumn = name_type_in_storage.type->getSubcolumn(name_and_type.getSubcolumnName(), temp_column);
@ -328,12 +359,17 @@ void MergeTreeReaderCompact::readData(
{
/// In case of reading only offsets use the correct serialization for reading the prefix
auto serialization = getSerializationInPart(name_and_type);
auto serialization_for_prefix = column_for_offsets ? getSerializationInPart(*column_for_offsets) : serialization;
deserialize_settings.getter = buffer_getter_for_prefix;
serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state);
if (column_for_offsets)
{
auto serialization_for_prefix = getSerializationInPart(*column_for_offsets);
deserialize_settings.getter = buffer_getter_for_prefix;
serialization_for_prefix->deserializeBinaryBulkStatePrefix(deserialize_settings, state_for_prefix);
}
deserialize_settings.getter = buffer_getter;
serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, state);
serialization->deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state, nullptr);
}

View File

@ -52,6 +52,12 @@ private:
MergeTreeMarksLoader marks_loader;
/// Storage columns with collected separate arrays of Nested to columns of Nested type.
/// They may be needed for finding offsets of missing Nested columns in parts.
/// They are rarely used and are heavy to initialize, so we create them
/// only on demand and cache in this field.
std::optional<ColumnsDescription> storage_columns_with_collected_nested;
/// Positions of columns in part structure.
using ColumnPositions = std::vector<std::optional<size_t>>;
ColumnPositions column_positions;
@ -85,7 +91,6 @@ private:
ReadBufferFromFileBase::ProfileCallback profile_callback;
clockid_t clock_type;
bool initialized = false;
};

View File

@ -10,14 +10,14 @@
['0','1','2','3','4','5','6','7','8'] ['','','','','','','','','']
[] []
[[]] [[]]
[[],['0']] [[],['']]
[[],['0'],['0','1']] [[],[''],['','']]
[[],['0'],['0','1'],['0','1','2']] [[],[''],['',''],['','','']]
[[],['0'],['0','1'],['0','1','2'],[]] [[],[''],['',''],['','',''],[]]
[[],['0'],['0','1'],['0','1','2'],[],['0']] [[],[''],['',''],['','',''],[],['']]
[[],['0'],['0','1'],['0','1','2'],[],['0'],['0','1']] [[],[''],['',''],['','',''],[],[''],['','']]
[[],['0'],['0','1'],['0','1','2'],[],['0'],['0','1'],['0','1','2']] [[],[''],['',''],['','',''],[],[''],['',''],['','','']]
[[],['0'],['0','1'],['0','1','2'],[],['0'],['0','1'],['0','1','2'],[]] [[],[''],['',''],['','',''],[],[''],['',''],['','',''],[]]
[[],['0']] [[],[]]
[[],['0'],['0','1']] [[],[],[]]
[[],['0'],['0','1'],['0','1','2']] [[],[],[],[]]
[[],['0'],['0','1'],['0','1','2'],[]] [[],[],[],[],[]]
[[],['0'],['0','1'],['0','1','2'],[],['0']] [[],[],[],[],[],[]]
[[],['0'],['0','1'],['0','1','2'],[],['0'],['0','1']] [[],[],[],[],[],[],[]]
[[],['0'],['0','1'],['0','1','2'],[],['0'],['0','1'],['0','1','2']] [[],[],[],[],[],[],[],[]]
[[],['0'],['0','1'],['0','1','2'],[],['0'],['0','1'],['0','1','2'],[]] [[],[],[],[],[],[],[],[],[]]
[] []
[{}] [{}]
[{},{'k0':0}] [{},{}]