ClickHouse/src/Storages/MergeTree/MergeTreeReaderCompact.cpp

247 lines
8.6 KiB
C++
Raw Normal View History

2019-10-11 15:37:16 +00:00
#include <Storages/MergeTree/MergeTreeReaderCompact.h>
2019-12-16 14:51:19 +00:00
#include <Storages/MergeTree/MergeTreeDataPartCompact.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/NestedUtils.h>
2019-10-11 15:37:16 +00:00
#include <Poco/File.h>
namespace DB
{
2019-10-16 18:27:53 +00:00
/// Error codes referenced in this file. They are only declared here (`extern`);
/// the definitions live in another translation unit.
namespace ErrorCodes
{
    /// Attached to exceptions when a seek to a mark fails (see seekToMark).
    extern const int ARGUMENT_OUT_OF_BOUND;
    /// Thrown when a granule yields fewer rows than expected (see readRows).
    extern const int CANNOT_READ_ALL_DATA;
}
/// Builds a reader over the single data file of a compact part.
///
/// The constructor:
///  - forwards the part, requested columns, caches, mark ranges and settings to IMergeTreeReader;
///  - sets up the marks loader for the part's marks file;
///  - opens the data file once and creates one compressed read buffer per requested column
///    on top of that shared file buffer (cached if an uncompressed cache is provided);
///  - resolves, for every requested column, its position inside the part.
///
/// profile_callback_ / clock_type_ are installed on every created compressed buffer
/// when the callback is set.
MergeTreeReaderCompact::MergeTreeReaderCompact(
    DataPartCompactPtr data_part_,
    NamesAndTypesList columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    UncompressedCache * uncompressed_cache_,
    MarkCache * mark_cache_,
    MarkRanges mark_ranges_,
    MergeTreeReaderSettings settings_,
    ValueSizeMap avg_value_size_hints_,
    const ReadBufferFromFileBase::ProfileCallback & profile_callback_,
    clockid_t clock_type_)
    : IMergeTreeReader(
        std::move(data_part_),
        std::move(columns_),
        metadata_snapshot_,
        uncompressed_cache_,
        mark_cache_,
        std::move(mark_ranges_),
        std::move(settings_),
        std::move(avg_value_size_hints_))
    , marks_loader(
        data_part->volume->getDisk(),
        mark_cache,
        data_part->index_granularity_info.getMarksFilePath(data_part->getFullRelativePath() + MergeTreeDataPartCompact::DATA_FILE_NAME),
        data_part->getMarksCount(),
        data_part->index_granularity_info,
        settings.save_marks_in_cache,
        data_part->getColumns().size())
{
    size_t buffer_size = settings.max_read_buffer_size;
    const String full_data_path = data_part->getFullRelativePath() + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION;

    /// A single file buffer is shared by the compressed buffers of all columns.
    file_in = data_part->volume->getDisk()->readFile(
        full_data_path, buffer_size, 0,
        settings.min_bytes_to_use_direct_io,
        settings.min_bytes_to_use_mmap_io);

    auto full_path = fullPath(data_part->volume->getDisk(), full_data_path);
    for (const auto & column : columns)
    {
        std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;
        std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;

        if (uncompressed_cache)
        {
            cached_buffer = std::make_unique<CachedCompressedReadBuffer>(full_path, file_in.get(), uncompressed_cache);
            if (profile_callback_)
                cached_buffer->setProfileCallback(profile_callback_, clock_type_);
        }
        else
        {
            non_cached_buffer = std::make_unique<CompressedReadBufferFromFile>(*file_in);
            if (profile_callback_)
                non_cached_buffer->setProfileCallback(profile_callback_, clock_type_);
        }

        /// readRows() hands readData() the name returned by getColumnFromPart(), and readData()
        /// looks the stream up by that name. Register the stream under the same name, so that
        /// the lookup never default-inserts an empty stream (with no buffer) if the name
        /// reported for the part differs from the requested one.
        const auto column_from_part = getColumnFromPart(column);
        column_streams[column_from_part.name] = ColumnStream{std::move(cached_buffer), std::move(non_cached_buffer)};
    }

    size_t columns_num = columns.size();
    column_positions.resize(columns_num);
    read_only_offsets.resize(columns_num);

    auto name_and_type = columns.begin();
    for (size_t i = 0; i < columns_num; ++i, ++name_and_type)
    {
        const auto & [name, type] = getColumnFromPart(*name_and_type);
        auto position = data_part->getColumnPosition(name);

        if (!position && typeid_cast<const DataTypeArray *>(type.get()))
        {
            /// If array of Nested column is missing in part,
            /// we have to read its offsets if they exist.
            position = findColumnForOffsets(name);
            read_only_offsets[i] = (position != std::nullopt);
        }

        /// Columns without a position stay unset here and are skipped by readRows().
        column_positions[i] = std::move(position);
    }
}
2019-12-19 13:10:57 +00:00
size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
2019-10-11 15:37:16 +00:00
{
2019-11-28 20:14:41 +00:00
if (continue_reading)
from_mark = next_mark;
2019-10-16 18:27:53 +00:00
2019-11-28 20:14:41 +00:00
size_t read_rows = 0;
2019-12-12 18:55:19 +00:00
size_t num_columns = columns.size();
2020-04-14 19:47:19 +00:00
checkNumberOfColumns(num_columns);
2019-12-12 18:55:19 +00:00
2020-01-15 16:39:29 +00:00
MutableColumns mutable_columns(num_columns);
auto column_it = columns.begin();
for (size_t i = 0; i < num_columns; ++i, ++column_it)
{
if (!column_positions[i])
continue;
bool append = res_columns[i] != nullptr;
if (!append)
res_columns[i] = getColumnFromPart(*column_it).type->createColumn();
2020-01-15 16:39:29 +00:00
mutable_columns[i] = res_columns[i]->assumeMutable();
}
2019-11-27 11:35:27 +00:00
while (read_rows < max_rows_to_read)
2019-10-16 18:27:53 +00:00
{
2019-11-27 11:35:27 +00:00
size_t rows_to_read = data_part->index_granularity.getMarkRows(from_mark);
2019-10-16 18:27:53 +00:00
2019-12-19 13:10:57 +00:00
auto name_and_type = columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
2019-10-16 18:27:53 +00:00
{
2020-01-15 16:39:29 +00:00
if (!res_columns[pos])
2019-12-12 18:55:19 +00:00
continue;
auto [name, type] = getColumnFromPart(*name_and_type);
2020-01-15 16:39:29 +00:00
auto & column = mutable_columns[pos];
2019-11-27 11:35:27 +00:00
try
{
2019-11-28 20:14:41 +00:00
size_t column_size_before_reading = column->size();
2019-12-12 18:55:19 +00:00
2020-01-17 12:24:27 +00:00
readData(name, *column, *type, from_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]);
size_t read_rows_in_column = column->size() - column_size_before_reading;
2020-01-15 16:39:29 +00:00
2019-11-28 20:14:41 +00:00
if (read_rows_in_column < rows_to_read)
throw Exception("Cannot read all data in MergeTreeReaderCompact. Rows read: " + toString(read_rows_in_column) +
2019-12-12 18:55:19 +00:00
". Rows expected: " + toString(rows_to_read) + ".", ErrorCodes::CANNOT_READ_ALL_DATA);
2019-11-27 11:35:27 +00:00
}
catch (Exception & e)
{
/// Better diagnostics.
2019-12-19 13:10:57 +00:00
e.addMessage("(while reading column " + name + ")");
2019-11-27 11:35:27 +00:00
throw;
}
2019-10-16 18:27:53 +00:00
}
2019-11-27 11:35:27 +00:00
++from_mark;
read_rows += rows_to_read;
2019-10-16 18:27:53 +00:00
}
2020-01-15 16:39:29 +00:00
for (size_t i = 0; i < num_columns; ++i)
{
auto & column = mutable_columns[i];
2020-03-09 02:55:28 +00:00
if (column && !column->empty())
2020-01-15 16:39:29 +00:00
res_columns[i] = std::move(column);
else
res_columns[i] = nullptr;
}
2019-11-28 20:14:41 +00:00
next_mark = from_mark;
2019-11-27 11:35:27 +00:00
2019-10-16 18:27:53 +00:00
return read_rows;
2019-10-11 15:37:16 +00:00
}
2019-10-16 18:27:53 +00:00
void MergeTreeReaderCompact::readData(
2020-01-17 12:24:27 +00:00
const String & name, IColumn & column, const IDataType & type,
size_t from_mark, size_t column_position, size_t rows_to_read, bool only_offsets)
2019-10-16 18:27:53 +00:00
{
auto & stream = column_streams[name];
if (!isContinuousReading(from_mark, column_position))
seekToMark(stream, from_mark, column_position);
auto buffer_getter = [&](const IDataType::SubstreamPath & substream_path) -> ReadBuffer *
{
if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != IDataType::Substream::ArraySizes))
return nullptr;
return stream.data_buffer;
};
2019-10-16 18:27:53 +00:00
IDataType::DeserializeBinaryBulkSettings deserialize_settings;
deserialize_settings.getter = buffer_getter;
2020-01-17 12:24:27 +00:00
deserialize_settings.avg_value_size_hint = avg_value_size_hints[name];
2019-12-02 15:21:07 +00:00
deserialize_settings.position_independent_encoding = true;
2019-10-16 18:27:53 +00:00
IDataType::DeserializeBinaryBulkStatePtr state;
type.deserializeBinaryBulkStatePrefix(deserialize_settings, state);
type.deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state);
/// The buffer is left in inconsistent state after reading single offsets
if (only_offsets)
last_read_granule.reset();
else
last_read_granule.emplace(from_mark, column_position);
2019-10-16 18:27:53 +00:00
}
/// Positions the stream's compressed buffer at the mark of the given (granule, column) pair.
///
/// @param stream        Stream whose buffer (cached or non-cached, whichever is set) is moved.
/// @param row_index     Index of the mark (granule) to seek to.
/// @param column_index  Position of the column inside the part.
/// @throws Exception    Rethrows any exception raised by the seek; for ARGUMENT_OUT_OF_BOUND
///                      the target mark is appended to the message first.
void MergeTreeReaderCompact::seekToMark(ColumnStream & stream, size_t row_index, size_t column_index)
{
    MarkInCompressedFile mark = marks_loader.getMark(row_index, column_index);

    try
    {
        if (stream.cached_buffer)
            stream.cached_buffer->seek(mark.offset_in_compressed_file, mark.offset_in_decompressed_block);
        if (stream.non_cached_buffer)
            stream.non_cached_buffer->seek(mark.offset_in_compressed_file, mark.offset_in_decompressed_block);
    }
    catch (Exception & e)
    {
        /// Better diagnostics. Both the inner "(row, column)" pair and the outer note are closed.
        if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND)
            e.addMessage("(while seeking to mark (" + toString(row_index) + ", " + toString(column_index) + "))");

        throw;
    }
}
bool MergeTreeReaderCompact::isContinuousReading(size_t mark, size_t column_position)
2019-10-16 18:27:53 +00:00
{
if (!last_read_granule)
return false;
const auto & [last_mark, last_column] = *last_read_granule;
return (mark == last_mark && column_position == last_column + 1)
2020-01-16 16:15:01 +00:00
|| (mark == last_mark + 1 && column_position == 0 && last_column == data_part->getColumns().size() - 1);
2019-10-16 18:27:53 +00:00
}
/// Takes ownership of both buffers and points data_buffer at the one to read from:
/// the cached buffer if it is set, otherwise the non-cached buffer (which may be null).
MergeTreeReaderCompact::ColumnStream::ColumnStream(
    std::unique_ptr<CachedCompressedReadBuffer> cached_buffer_,
    std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer_)
    : cached_buffer(std::move(cached_buffer_))
    , non_cached_buffer(std::move(non_cached_buffer_))
{
    /// Default to the non-cached buffer; the cached one takes precedence when present.
    data_buffer = non_cached_buffer.get();
    if (cached_buffer)
        data_buffer = cached_buffer.get();
}
2019-10-11 15:37:16 +00:00
}