polymorphic parts (development)

This commit is contained in:
CurtizJ 2019-11-20 16:33:41 +03:00
parent e1d13ea5b0
commit 426c62aafe
10 changed files with 118 additions and 123 deletions

View File

@ -291,9 +291,14 @@ void IMergeTreeDataPartWriter::calculateAndSerializeSkipIndices(
void IMergeTreeDataPartWriter::finishPrimaryIndexSerialization(MergeTreeData::DataPart::Checksums & checksums) void IMergeTreeDataPartWriter::finishPrimaryIndexSerialization(MergeTreeData::DataPart::Checksums & checksums)
{ {
std::cerr << "finishPrimaryIndexSerialization called...\n"; std::cerr << "finishPrimaryIndexSerialization called...\n";
bool write_final_mark = (with_final_mark && data_written);
if (write_final_mark && compute_granularity)
index_granularity.appendMark(0);
if (index_stream) if (index_stream)
{ {
if (with_final_mark && data_written) if (write_final_mark)
{ {
for (size_t j = 0; j < index_columns.size(); ++j) for (size_t j = 0; j < index_columns.size(); ++j)
{ {
@ -301,9 +306,6 @@ void IMergeTreeDataPartWriter::finishPrimaryIndexSerialization(MergeTreeData::Da
index_types[j]->serializeBinary(last_index_row[j], *index_stream); index_types[j]->serializeBinary(last_index_row[j], *index_stream);
} }
if (compute_granularity)
index_granularity.appendMark(0);
last_index_row.clear(); last_index_row.clear();
} }

View File

@ -65,32 +65,6 @@ static bool arrayHasNoElementsRead(const IColumn & column)
return last_offset != 0; return last_offset != 0;
} }
IMergeTreeReader::MarksPtr IMergeTreeReader::loadMarks(const String & mrk_path, const LoadFunc & load_func)
{
MarksPtr marks;
if (mark_cache)
{
auto key = mark_cache->hash(mrk_path);
if (settings.save_marks_in_cache)
{
marks = mark_cache->getOrSet(key, load_func);
}
else
{
marks = mark_cache->get(key);
if (!marks)
marks = load_func();
}
}
else
marks = load_func();
if (!marks)
throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR);
return marks;
}
void IMergeTreeReader::fillMissingColumns(Block & res, bool & should_reorder, bool & should_evaluate_missing_defaults, size_t num_rows) void IMergeTreeReader::fillMissingColumns(Block & res, bool & should_reorder, bool & should_evaluate_missing_defaults, size_t num_rows)
{ {

View File

@ -53,15 +53,10 @@ public:
return all_mark_ranges.back().begin; return all_mark_ranges.back().begin;
} }
using MarksPtr = MarkCache::MappedPtr;
MergeTreeData::DataPartPtr data_part; MergeTreeData::DataPartPtr data_part;
protected: protected:
using LoadFunc = std::function<MarksPtr()>;
MarksPtr loadMarks(const String & mrk_path, const LoadFunc & load_func);
/// avg_value_size_hints are used to reduce the number of reallocations when creating columns of variable size. /// avg_value_size_hints are used to reduce the number of reallocations when creating columns of variable size.
ValueSizeMap avg_value_size_hints; ValueSizeMap avg_value_size_hints;
/// Stores states for IDataType::deserializeBinaryBulk /// Stores states for IDataType::deserializeBinaryBulk

View File

@ -154,8 +154,6 @@ void MergeTreeDataPartWriterCompact::finishDataSerialization(IMergeTreeDataPart:
writeIntBinary(stream->plain_hashing.count(), stream->marks); writeIntBinary(stream->plain_hashing.count(), stream->marks);
writeIntBinary(stream->compressed.offset(), stream->marks); writeIntBinary(stream->compressed.offset(), stream->marks);
} }
if (compute_granularity)
index_granularity.appendMark(0);
} }
stream->finalize(); stream->finalize();

View File

@ -0,0 +1,51 @@
#include <Storages/MergeTree/MergeTreeMarksLoader.h>

#include <string>
namespace DB
{
/// Stores everything needed to load the marks later; nothing is read here.
/// The marks themselves are fetched by loadMarks() on the first getMark() call.
MergeTreeMarksLoader::MergeTreeMarksLoader(
    MarkCache * mark_cache_,
    const String & mrk_path_,
    const LoadFunc & load_func_,
    bool save_marks_in_cache_,
    size_t columns_num_)
    : mark_cache(mark_cache_)
    , mrk_path(mrk_path_)
    , load_func(load_func_)
    , save_marks_in_cache(save_marks_in_cache_)
    , columns_num(columns_num_)
{
}
/// Returns the mark stored for (row_index, column_index).
/// Marks are loaded on the first call and reused afterwards.
/// Throws LOGICAL_ERROR if column_index is not less than columns_num.
const MarkInCompressedFile & MergeTreeMarksLoader::getMark(size_t row_index, size_t column_index)
{
    if (!marks)
        loadMarks();

    if (column_index >= columns_num)
        throw Exception("Column index: " + std::to_string(column_index)
            + " is out of range [0, " + std::to_string(columns_num) + ")", ErrorCodes::LOGICAL_ERROR);

    /// Marks are laid out row by row, columns_num entries per row.
    return (*marks)[row_index * columns_num + column_index];
}
/// Fills `marks`, either from the mark cache or by calling load_func.
/// Throws LOGICAL_ERROR if no marks were obtained.
void MergeTreeMarksLoader::loadMarks()
{
    if (!mark_cache)
    {
        /// No cache available: always load directly.
        marks = load_func();
    }
    else
    {
        auto cache_key = mark_cache->hash(mrk_path);

        if (save_marks_in_cache)
        {
            /// The cache either returns the existing entry or calls load_func and stores the result.
            marks = mark_cache->getOrSet(cache_key, load_func);
        }
        else
        {
            /// Look the entry up, but do not put freshly loaded marks into the cache.
            marks = mark_cache->get(cache_key);
            if (!marks)
                marks = load_func();
        }
    }

    if (!marks)
        throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR);
}
}

View File

@ -0,0 +1,35 @@
#include <Storages/MarkCache.h>
namespace DB
{
/// Lazily loads the marks of one marks file and gives access to individual marks.
/// Marks are obtained on the first getMark() call, either from the mark cache
/// (if one was given) or by calling the supplied load function.
class MergeTreeMarksLoader
{
public:
    using MarksPtr = MarkCache::MappedPtr;
    using LoadFunc = std::function<MarksPtr()>;

    /// Creates an empty loader with no load function; assign a fully constructed
    /// loader to it before calling getMark().
    MergeTreeMarksLoader() = default;

    /// mark_cache_ may be nullptr, in which case load_func_ is always called directly.
    /// save_marks_in_cache_ selects whether freshly loaded marks are stored in the cache.
    /// columns_num_ is the number of marks stored per row in the marks array.
    MergeTreeMarksLoader(MarkCache * mark_cache_,
        const String & mrk_path_,
        const LoadFunc & load_func_,
        bool save_marks_in_cache_,
        size_t columns_num_ = 1);

    /// Returns the mark at position row_index * columns_num + column_index, loading marks if needed.
    const MarkInCompressedFile & getMark(size_t row_index, size_t column_index = 0);

    /// True once the marks have been loaded.
    bool initialized() const { return marks != nullptr; }

private:
    MarkCache * mark_cache = nullptr;
    String mrk_path;
    LoadFunc load_func;
    bool save_marks_in_cache = false;
    /// Initialized so that a default-constructed loader never holds an indeterminate value;
    /// 1 matches the default of the constructor argument.
    size_t columns_num = 1;
    MarksPtr marks;

    void loadMarks();
};
}

View File

@ -19,6 +19,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(const MergeTreeData::DataPartPtr
, uncompressed_cache_, mark_cache_, mark_ranges_ , uncompressed_cache_, mark_cache_, mark_ranges_
, settings_, avg_value_size_hints_) , settings_, avg_value_size_hints_)
{ {
initMarksLoader();
size_t buffer_size = settings.max_read_buffer_size; size_t buffer_size = settings.max_read_buffer_size;
if (uncompressed_cache) if (uncompressed_cache)
@ -121,13 +122,14 @@ void MergeTreeReaderCompact::readData(
} }
void MergeTreeReaderCompact::loadMarks() void MergeTreeReaderCompact::initMarksLoader()
{ {
const auto & index_granularity_info = data_part->index_granularity_info; const auto & index_granularity_info = data_part->index_granularity_info;
size_t marks_count = data_part->getMarksCount(); size_t marks_count = data_part->getMarksCount();
std::string mrk_path = index_granularity_info.getMarksFilePath(path + NAME_OF_FILE_WITH_DATA); std::string mrk_path = index_granularity_info.getMarksFilePath(path + NAME_OF_FILE_WITH_DATA);
size_t columns_num = data_part->columns.size();
auto load_func = [&]() -> MarkCache::MappedPtr auto load = [&]() -> MarkCache::MappedPtr
{ {
size_t file_size = Poco::File(mrk_path).getSize(); size_t file_size = Poco::File(mrk_path).getSize();
@ -140,7 +142,6 @@ void MergeTreeReaderCompact::loadMarks()
/// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache. /// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache.
auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerActionLock(); auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerActionLock();
size_t columns_num = data_part->columns.size();
auto res = std::make_shared<MarksInCompressedFile>(marks_count * columns_num); auto res = std::make_shared<MarksInCompressedFile>(marks_count * columns_num);
@ -168,25 +169,14 @@ void MergeTreeReaderCompact::loadMarks()
return res; return res;
}; };
std::cerr << "(MergeTreeReaderCompact::loadMarks) table: " << storage.getTableName() << ", part: " << path << "\n"; marks_loader = MergeTreeMarksLoader{mark_cache, mrk_path, load, settings.save_marks_in_cache, columns_num};
std::cerr << "(MergeTreeReaderCompact::loadMarks) start marks load..." << "\n";
auto marks_array = IMergeTreeReader::loadMarks(mrk_path, load_func);
marks = MarksInCompressedFileCompact(marks_array, columns.size());
std::cerr << "(MergeTreeReaderCompact::loadMarks) end marks load..." << "\n"; std::cerr << "(MergeTreeReaderCompact::loadMarks) end marks load..." << "\n";
} }
const MarkInCompressedFile & MergeTreeReaderCompact::getMark(size_t row, size_t col) void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index)
{ {
if (!marks.initialized()) MarkInCompressedFile mark = marks_loader.getMark(row_index, column_index);
loadMarks();
return marks.getMark(row, col);
}
void MergeTreeReaderCompact::seekToMark(size_t row, size_t col)
{
MarkInCompressedFile mark = getMark(row, col);
std::cerr << "(MergeTreeReaderCompact::seekToMark) mark: (" << mark.offset_in_compressed_file << ", " << mark.offset_in_decompressed_block << "\n"; std::cerr << "(MergeTreeReaderCompact::seekToMark) mark: (" << mark.offset_in_compressed_file << ", " << mark.offset_in_decompressed_block << "\n";
@ -201,7 +191,7 @@ void MergeTreeReaderCompact::seekToMark(size_t row, size_t col)
{ {
/// Better diagnostics. /// Better diagnostics.
if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND) if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND)
e.addMessage("(while seeking to mark (" + toString(row) + ", " + toString(col) + ")"); e.addMessage("(while seeking to mark (" + toString(row_index) + ", " + toString(column_index) + ")");
throw; throw;
} }

View File

@ -8,38 +8,6 @@
namespace DB namespace DB
{ {
class MarksInCompressedFileCompact
{
public:
using MarksPtr = MarkCache::MappedPtr;
MarksInCompressedFileCompact() = default;
MarksInCompressedFileCompact(const MarksPtr & data_, size_t columns_num_)
: data(data_), columns_num(columns_num_) {}
const MarkInCompressedFile & getMark(size_t index, size_t column) const
{
return (*data)[index * columns_num + column];
}
char * getRowAddress(size_t index) const
{
return reinterpret_cast<char *>(data->data() + index * columns_num);
}
size_t getRowSize() const
{
return sizeof(MarkInCompressedFile) * columns_num;
}
bool initialized() { return data != nullptr; }
private:
MarksPtr data;
size_t columns_num;
};
/// Reads the data between pairs of marks in the same part. When reading consecutive ranges, avoids unnecessary seeks. /// Reads the data between pairs of marks in the same part. When reading consecutive ranges, avoids unnecessary seeks.
/// When ranges are almost consecutive, seeks are fast because they are performed inside the buffer. /// When ranges are almost consecutive, seeks are fast because they are performed inside the buffer.
/// Avoids loading the marks file if it is not needed (e.g. when reading the whole part). /// Avoids loading the marks file if it is not needed (e.g. when reading the whole part).
@ -63,9 +31,9 @@ private:
std::unique_ptr<CachedCompressedReadBuffer> cached_buffer; std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;
std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer; std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;
MarksInCompressedFileCompact marks; MergeTreeMarksLoader marks_loader;
void loadMarks(); void initMarksLoader();
void seekToStart(); void seekToStart();
void seekToMark(size_t row, size_t col); void seekToMark(size_t row, size_t col);
const MarkInCompressedFile & getMark(size_t row, size_t col); const MarkInCompressedFile & getMark(size_t row, size_t col);

View File

@ -32,6 +32,8 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// Care should be taken to not load marks when the part is empty (marks_count == 0). /// Care should be taken to not load marks when the part is empty (marks_count == 0).
initMarksLoader();
for (const auto & mark_range : all_mark_ranges) for (const auto & mark_range : all_mark_ranges)
{ {
size_t left_mark = mark_range.begin; size_t left_mark = mark_range.begin;
@ -41,10 +43,10 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// and we will use max_read_buffer_size for buffer size, thus avoiding the need to load marks. /// and we will use max_read_buffer_size for buffer size, thus avoiding the need to load marks.
/// If the end of range is inside the block, we will need to read it too. /// If the end of range is inside the block, we will need to read it too.
if (right_mark < marks_count && getMark(right_mark).offset_in_decompressed_block > 0) if (right_mark < marks_count && marks_loader.getMark(right_mark).offset_in_decompressed_block > 0)
{ {
while (right_mark < marks_count while (right_mark < marks_count
&& getMark(right_mark).offset_in_compressed_file == getMark(mark_range.end).offset_in_compressed_file) && marks_loader.getMark(right_mark).offset_in_compressed_file == marks_loader.getMark(mark_range.end).offset_in_compressed_file)
{ {
++right_mark; ++right_mark;
} }
@ -55,13 +57,13 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// If there are no marks after the end of range, just use file size /// If there are no marks after the end of range, just use file size
if (right_mark >= marks_count if (right_mark >= marks_count
|| (right_mark + 1 == marks_count || (right_mark + 1 == marks_count
&& getMark(right_mark).offset_in_compressed_file == getMark(mark_range.end).offset_in_compressed_file)) && marks_loader.getMark(right_mark).offset_in_compressed_file == marks_loader.getMark(mark_range.end).offset_in_compressed_file))
{ {
mark_range_bytes = file_size - (left_mark < marks_count ? getMark(left_mark).offset_in_compressed_file : 0); mark_range_bytes = file_size - (left_mark < marks_count ? marks_loader.getMark(left_mark).offset_in_compressed_file : 0);
} }
else else
{ {
mark_range_bytes = getMark(right_mark).offset_in_compressed_file - getMark(left_mark).offset_in_compressed_file; mark_range_bytes = marks_loader.getMark(right_mark).offset_in_compressed_file - marks_loader.getMark(left_mark).offset_in_compressed_file;
} }
max_mark_range_bytes = std::max(max_mark_range_bytes, mark_range_bytes); max_mark_range_bytes = std::max(max_mark_range_bytes, mark_range_bytes);
@ -101,16 +103,11 @@ MergeTreeReaderStream::MergeTreeReaderStream(
} }
const MarkInCompressedFile & MergeTreeReaderStream::getMark(size_t index) void MergeTreeReaderStream::initMarksLoader()
{ {
if (!marks) if (marks_loader.initialized())
loadMarks(); return;
return (*marks)[index];
}
void MergeTreeReaderStream::loadMarks()
{
std::string mrk_path = index_granularity_info->getMarksFilePath(path_prefix); std::string mrk_path = index_granularity_info->getMarksFilePath(path_prefix);
auto load = [&]() -> MarkCache::MappedPtr auto load = [&]() -> MarkCache::MappedPtr
@ -153,31 +150,13 @@ void MergeTreeReaderStream::loadMarks()
return res; return res;
}; };
if (mark_cache) marks_loader = MergeTreeMarksLoader{mark_cache, mrk_path, load, save_marks_in_cache};
{
auto key = mark_cache->hash(mrk_path);
if (save_marks_in_cache)
{
marks = mark_cache->getOrSet(key, load);
}
else
{
marks = mark_cache->get(key);
if (!marks)
marks = load();
}
}
else
marks = load();
if (!marks)
throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR);
} }
void MergeTreeReaderStream::seekToMark(size_t index) void MergeTreeReaderStream::seekToMark(size_t index)
{ {
MarkInCompressedFile mark = getMark(index); MarkInCompressedFile mark = marks_loader.getMark(index);
try try
{ {

View File

@ -6,6 +6,7 @@
#include <Compression/CachedCompressedReadBuffer.h> #include <Compression/CachedCompressedReadBuffer.h>
#include <Compression/CompressedReadBufferFromFile.h> #include <Compression/CompressedReadBufferFromFile.h>
#include <Storages/MergeTree/MergeTreeReaderSettings.h> #include <Storages/MergeTree/MergeTreeReaderSettings.h>
#include <Storages/MergeTree/MergeTreeMarksLoader.h>
namespace DB namespace DB
@ -33,7 +34,7 @@ private:
/// NOTE: lazily loads marks from the marks cache. /// NOTE: lazily loads marks from the marks cache.
const MarkInCompressedFile & getMark(size_t index); const MarkInCompressedFile & getMark(size_t index);
void loadMarks(); void initMarksLoader();
std::string path_prefix; std::string path_prefix;
std::string data_file_extension; std::string data_file_extension;
@ -48,5 +49,7 @@ private:
std::unique_ptr<CachedCompressedReadBuffer> cached_buffer; std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;
std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer; std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;
MergeTreeMarksLoader marks_loader;
}; };
} }