ClickHouse/src/Storages/MergeTree/IMergeTreeReader.cpp

271 lines
10 KiB
C++
Raw Normal View History

2019-10-10 16:30:30 +00:00
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeArray.h>
#include <Common/escapeForFileName.h>
#include <Compression/CachedCompressedReadBuffer.h>
#include <Columns/ColumnArray.h>
2020-02-25 08:53:14 +00:00
#include <Interpreters/inplaceBlockConversions.h>
2022-07-20 20:30:16 +00:00
#include <Interpreters/Context.h>
2019-10-10 16:30:30 +00:00
#include <Storages/MergeTree/IMergeTreeReader.h>
#include <Common/typeid_cast.h>
namespace DB
{
namespace
{
/// File-local alias: an ordered map from a string key to a column pointer.
/// NOTE(review): this alias is not referenced anywhere in the visible part of this file —
/// confirm whether it is still needed.
using OffsetColumns = std::map<std::string, ColumnPtr>;
}
/// Error codes thrown from this file. Only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/// Initializes the state shared by MergeTree part readers.
///
/// Besides storing the passed caches, settings, metadata snapshot, mark ranges and size hints,
/// the constructor:
///  - takes the alter conversions from the part info;
///  - builds `requested_columns` from `columns_` and `part_columns` from the columns of the part
///    (with extra Nested handling for wide parts, see the comment in the initializer list);
///  - resolves, for every requested column, the column as it is found in the part (`columns_to_read`)
///    and its serialization (`serializations`). Both vectors are filled in the order of
///    `requested_columns`, so they can be indexed by the position of the requested column.
IMergeTreeReader::IMergeTreeReader(
    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
    const NamesAndTypesList & columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    UncompressedCache * uncompressed_cache_,
    MarkCache * mark_cache_,
    const MarkRanges & all_mark_ranges_,
    const MergeTreeReaderSettings & settings_,
    const ValueSizeMap & avg_value_size_hints_)
    /// The argument is taken by value, so move it into the member instead of copying it again.
    /// All initializers below intentionally use the member `data_part_info_for_read`,
    /// never the (now moved-from) argument.
    : data_part_info_for_read(std::move(data_part_info_for_read_))
    , avg_value_size_hints(avg_value_size_hints_)
    , uncompressed_cache(uncompressed_cache_)
    , mark_cache(mark_cache_)
    , settings(settings_)
    , metadata_snapshot(metadata_snapshot_)
    , all_mark_ranges(all_mark_ranges_)
    , alter_conversions(data_part_info_for_read->getAlterConversions())
    /// For wide parts convert plain arrays of Nested to subcolumns
    /// to allow to use shared offset column from cache.
    , requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_)
    , part_columns(data_part_info_for_read->isWidePart() ? Nested::collect(data_part_info_for_read->getColumns()) : data_part_info_for_read->getColumns())
{
    columns_to_read.reserve(requested_columns.size());
    serializations.reserve(requested_columns.size());

    for (const auto & column : requested_columns)
    {
        columns_to_read.emplace_back(getColumnInPart(column));
        serializations.emplace_back(getSerializationInPart(column));
    }
}
/// Returns a const reference to the stored `avg_value_size_hints` member
/// (initialized in the constructor from the `avg_value_size_hints_` argument).
const IMergeTreeReader::ValueSizeMap & IMergeTreeReader::getAvgValueSizeHints() const
{
return avg_value_size_hints;
}
/// Delegates to DB::fillMissingColumns to fill entries of `res_columns` for `num_rows` rows,
/// passing it the requested columns and the columns actually read from the part
/// (both converted with Nested::convertToSubcolumns).
///
/// On success `should_evaluate_missing_defaults` is set to true if at least one entry of
/// `res_columns` is still null, and to false otherwise. It is left untouched if an exception is thrown.
///
/// Any DB::Exception is annotated with the full path of the part and rethrown.
void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows) const
{
    try
    {
        NamesAndTypesList columns_in_part(columns_to_read.begin(), columns_to_read.end());

        DB::fillMissingColumns(
            res_columns,
            num_rows,
            Nested::convertToSubcolumns(requested_columns),
            Nested::convertToSubcolumns(columns_in_part),
            partially_read_columns,
            metadata_snapshot);

        /// A null entry means the column still has no value after filling.
        const auto is_null = [](const auto & column) { return column == nullptr; };
        const auto first_null = std::find_if(res_columns.begin(), res_columns.end(), is_null);
        should_evaluate_missing_defaults = (first_null != res_columns.end());
    }
    catch (Exception & e)
    {
        /// Better diagnostics.
        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
}
/// Evaluates expressions for columns that have no value yet and writes the results to `res_columns`.
///
/// additional_columns — block taken by value; every non-null entry of `res_columns` is appended to it
///                      (with the name and type of the corresponding requested column) before the
///                      expressions produced by DB::evaluateMissingDefaults are executed on it.
/// res_columns        — must contain exactly one entry per requested column, in the order of
///                      `requested_columns`. On return every entry is replaced by the column with
///                      the same name taken from the resulting block.
///
/// Throws LOGICAL_ERROR if the size of `res_columns` differs from the number of requested columns.
/// Any DB::Exception is annotated with the full path of the part and rethrown.
void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const
{
    try
    {
        size_t num_columns = requested_columns.size();

        /// The message used to name fillMissingColumns (copy-paste), which pointed to the wrong call site.
        if (res_columns.size() != num_columns)
            throw Exception("invalid number of columns passed to MergeTreeReader::evaluateMissingDefaults. "
                            "Expected " + toString(num_columns) + ", "
                            "got " + toString(res_columns.size()), ErrorCodes::LOGICAL_ERROR);

        /// Convert columns list to block.
        /// TODO: rewrite with columns interface. It will be possible after changes in ExpressionActions.
        auto name_and_type = requested_columns.begin();
        for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
        {
            /// Null entries are not put into the block.
            if (res_columns[pos] == nullptr)
                continue;

            additional_columns.insert({res_columns[pos], name_and_type->type, name_and_type->name});
        }

        auto dag = DB::evaluateMissingDefaults(
            additional_columns, requested_columns, metadata_snapshot->getColumns(), data_part_info_for_read->getContext());

        /// A null DAG means there is nothing to execute.
        if (dag)
        {
            dag->addMaterializingOutputActions();
            auto actions = std::make_shared<ExpressionActions>(
                std::move(dag),
                ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef()));
            actions->execute(additional_columns);
        }

        /// Move columns from block.
        name_and_type = requested_columns.begin();
        for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
            res_columns[pos] = std::move(additional_columns.getByName(name_and_type->name).column);
    }
    catch (Exception & e)
    {
        /// Better diagnostics.
        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
}
2022-07-21 14:47:19 +00:00
/// Returns the name under which `required_column` should be looked up in the part.
///
/// If the alter conversions report that the storage column was renamed, the result is the old
/// storage name concatenated (Nested::concatenateName) with the subcolumn name of `required_column`.
/// Otherwise the name of `required_column` is returned as is.
String IMergeTreeReader::getColumnNameInPart(const NameAndTypePair & required_column) const
{
    const auto storage_name = required_column.getNameInStorage();

    if (!alter_conversions.isColumnRenamed(storage_name))
        return required_column.name;

    const auto old_storage_name = alter_conversions.getColumnOldName(storage_name);
    return Nested::concatenateName(old_storage_name, required_column.getSubcolumnName());
}
2020-12-18 20:09:34 +00:00
2022-07-21 14:47:19 +00:00
/// Returns the column (or subcolumn) of the part that corresponds to `required_column`,
/// looked up among `part_columns` by the name returned from getColumnNameInPart.
/// If the part has no such column, `required_column` itself is returned.
NameAndTypePair IMergeTreeReader::getColumnInPart(const NameAndTypePair & required_column) const
{
    const auto lookup_name = getColumnNameInPart(required_column);

    if (auto found = part_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, lookup_name))
        return *found;

    return required_column;
}
2021-03-09 14:46:52 +00:00
2022-07-27 14:05:16 +00:00
/// Returns the serialization to use for `required_column`.
///
///  - If the part has no matching column or subcolumn, the serialization is obtained
///    from `required_column` itself.
///  - Otherwise it is obtained from the column found in the part, using the entry of the part's
///    serialization infos for that column's storage name when such an entry exists.
SerializationPtr IMergeTreeReader::getSerializationInPart(const NameAndTypePair & required_column) const
{
    const auto lookup_name = getColumnNameInPart(required_column);
    const auto part_column = part_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, lookup_name);

    if (part_column)
    {
        const auto & serialization_infos = data_part_info_for_read->getSerializationInfos();
        const auto info_it = serialization_infos.find(part_column->getNameInStorage());

        if (info_it == serialization_infos.end())
            return IDataType::getSerialization(*part_column);

        return IDataType::getSerialization(*part_column, *info_it->second);
    }

    return IDataType::getSerialization(required_column);
}
/// Runs DB::performRequiredConversions over the columns in `res_columns`.
///
/// Every non-null entry of `res_columns` is put into a temporary block under the name of the
/// requested column but with the type of the column as found in the part (getColumnInPart).
/// After the conversion every entry of `res_columns` is replaced by the column with the same name
/// taken from that block.
///
/// Throws LOGICAL_ERROR if the size of `res_columns` differs from the number of requested columns.
/// Any DB::Exception is annotated with the full path of the part and rethrown.
void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
{
    try
    {
        const size_t expected_size = requested_columns.size();

        if (res_columns.size() != expected_size)
            throw Exception(
                "Invalid number of columns passed to MergeTreeReader::performRequiredConversions. Expected "
                    + toString(expected_size) + ", got " + toString(res_columns.size()),
                ErrorCodes::LOGICAL_ERROR);

        Block copy_block;

        size_t pos = 0;
        for (const auto & requested : requested_columns)
        {
            /// Null entries are not put into the block.
            if (res_columns[pos] != nullptr)
                copy_block.insert({res_columns[pos], getColumnInPart(requested).type, requested.name});
            ++pos;
        }

        DB::performRequiredConversions(copy_block, requested_columns, data_part_info_for_read->getContext());

        /// Move columns from block.
        pos = 0;
        for (const auto & requested : requested_columns)
        {
            res_columns[pos] = std::move(copy_block.getByName(requested.name).column);
            ++pos;
        }
    }
    catch (Exception & e)
    {
        /// Better diagnostics.
        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
}
/// Looks for a column of the part whose array-sizes streams can be used for `required_column`.
///
/// Only part columns with the same Nested::extractTableName result as `required_column` are considered.
/// For each of them the number of leading array-sizes streams of `required_column` that the candidate
/// also has is counted (counting stops at the first stream the candidate lacks).
/// A candidate with at least one match replaces the current choice when no position has been found
/// yet or when it matches strictly more streams than the best seen so far.
///
/// Returns the position of the chosen column as reported by the part info, or an empty
/// ColumnPosition if no candidate matched.
IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const NameAndTypePair & required_column) const
{
    /// Collects full names (`table_name` + stream subcolumn name) of all streams
    /// whose last substream is ArraySizes.
    auto collect_array_sizes_streams = [](const auto & serialization, const auto & table_name)
    {
        Names streams;
        serialization->enumerateStreams([&](const auto & subpath)
        {
            if (!subpath.empty() && subpath.back().type == ISerialization::Substream::ArraySizes)
                streams.push_back(Nested::concatenateName(table_name, ISerialization::getSubcolumnNameForStream(subpath)));
        });
        return streams;
    };

    const auto required_table_name = Nested::extractTableName(required_column.getNameInStorage());
    const auto required_streams = collect_array_sizes_streams(getSerializationInPart(required_column), required_table_name);

    ColumnPosition position;
    size_t best_match = 0;

    /// Find column that has maximal number of matching
    /// offsets columns with required_column.
    for (const auto & part_column : data_part_info_for_read->getColumns())
    {
        const auto candidate_table_name = Nested::extractTableName(part_column.name);
        if (candidate_table_name != required_table_name)
            continue;

        const auto candidate_streams
            = collect_array_sizes_streams(data_part_info_for_read->getSerialization(part_column), candidate_table_name);
        const NameSet candidate_set(candidate_streams.begin(), candidate_streams.end());

        /// Length of the prefix of required streams that the candidate also has.
        size_t matched = 0;
        while (matched < required_streams.size() && candidate_set.contains(required_streams[matched]))
            ++matched;

        if (matched == 0)
            continue;

        /// Keep the current choice unless this candidate is strictly better.
        if (position && matched <= best_match)
            continue;

        best_match = matched;
        position = data_part_info_for_read->getColumnPosition(part_column.name);
    }

    return position;
}
void IMergeTreeReader::checkNumberOfColumns(size_t num_columns_to_read) const
2020-04-14 19:47:19 +00:00
{
2022-07-27 14:05:16 +00:00
if (num_columns_to_read != requested_columns.size())
2020-04-14 19:47:19 +00:00
throw Exception("invalid number of columns passed to MergeTreeReader::readRows. "
2022-07-27 14:05:16 +00:00
"Expected " + toString(requested_columns.size()) + ", "
2020-04-14 19:47:19 +00:00
"got " + toString(num_columns_to_read), ErrorCodes::LOGICAL_ERROR);
}
2019-10-10 16:30:30 +00:00
}