polymorphic parts (development)

This commit is contained in:
CurtizJ 2019-11-20 16:33:41 +03:00
parent e1d13ea5b0
commit 426c62aafe
10 changed files with 118 additions and 123 deletions

View File

@ -291,9 +291,14 @@ void IMergeTreeDataPartWriter::calculateAndSerializeSkipIndices(
void IMergeTreeDataPartWriter::finishPrimaryIndexSerialization(MergeTreeData::DataPart::Checksums & checksums)
{
std::cerr << "finishPrimaryIndexSerialization called...\n";
bool write_final_mark = (with_final_mark && data_written);
if (write_final_mark && compute_granularity)
index_granularity.appendMark(0);
if (index_stream)
{
if (with_final_mark && data_written)
if (write_final_mark)
{
for (size_t j = 0; j < index_columns.size(); ++j)
{
@ -301,9 +306,6 @@ void IMergeTreeDataPartWriter::finishPrimaryIndexSerialization(MergeTreeData::Da
index_types[j]->serializeBinary(last_index_row[j], *index_stream);
}
if (compute_granularity)
index_granularity.appendMark(0);
last_index_row.clear();
}

View File

@ -65,32 +65,6 @@ static bool arrayHasNoElementsRead(const IColumn & column)
return last_offset != 0;
}
/// Obtains the marks identified by `mrk_path`.
///
/// Without a mark cache the marks are produced directly by `load_func`.
/// With a mark cache the lookup key is derived from `mrk_path`; depending on
/// `settings.save_marks_in_cache` the result is either stored in the cache
/// (getOrSet) or only looked up there, falling back to `load_func` on a miss.
///
/// Throws LOGICAL_ERROR if no marks could be obtained.
IMergeTreeReader::MarksPtr IMergeTreeReader::loadMarks(const String & mrk_path, const LoadFunc & load_func)
{
    MarksPtr loaded;

    if (!mark_cache)
    {
        /// No cache configured: always read the marks ourselves.
        loaded = load_func();
    }
    else
    {
        auto cache_key = mark_cache->hash(mrk_path);

        if (!settings.save_marks_in_cache)
        {
            /// Read-only use of the cache: a miss is served by load_func and is not stored.
            loaded = mark_cache->get(cache_key);
            if (!loaded)
                loaded = load_func();
        }
        else
        {
            loaded = mark_cache->getOrSet(cache_key, load_func);
        }
    }

    if (!loaded)
        throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR);

    return loaded;
}
void IMergeTreeReader::fillMissingColumns(Block & res, bool & should_reorder, bool & should_evaluate_missing_defaults, size_t num_rows)
{

View File

@ -53,15 +53,10 @@ public:
return all_mark_ranges.back().begin;
}
using MarksPtr = MarkCache::MappedPtr;
MergeTreeData::DataPartPtr data_part;
protected:
using LoadFunc = std::function<MarksPtr()>;
MarksPtr loadMarks(const String & mrk_path, const LoadFunc & load_func);
/// avg_value_size_hints are used to reduce the number of reallocations when creating columns of variable size.
ValueSizeMap avg_value_size_hints;
/// Stores states for IDataType::deserializeBinaryBulk

View File

@ -154,8 +154,6 @@ void MergeTreeDataPartWriterCompact::finishDataSerialization(IMergeTreeDataPart:
writeIntBinary(stream->plain_hashing.count(), stream->marks);
writeIntBinary(stream->compressed.offset(), stream->marks);
}
if (compute_granularity)
index_granularity.appendMark(0);
}
stream->finalize();

View File

@ -0,0 +1,51 @@
#include <Storages/MergeTree/MergeTreeMarksLoader.h>
namespace DB
{
MergeTreeMarksLoader::MergeTreeMarksLoader(
MarkCache * mark_cache_,
const String & mrk_path_,
const LoadFunc & load_func_,
bool save_marks_in_cache_,
size_t columns_num_)
: mark_cache(mark_cache_)
, mrk_path(mrk_path_)
, load_func(load_func_)
, save_marks_in_cache(save_marks_in_cache_)
, columns_num(columns_num_) {}
/// Returns the mark at (`row_index`, `column_index`), loading the marks on first use.
///
/// The marks array is addressed as a row-major table with `columns_num` marks per row,
/// so the returned element is `row_index * columns_num + column_index`.
/// `row_index` is not range-checked here; callers are expected to pass a valid row.
///
/// Throws LOGICAL_ERROR if `column_index` is not less than `columns_num`
/// (and whatever loadMarks() throws if the marks cannot be obtained).
const MarkInCompressedFile & MergeTreeMarksLoader::getMark(size_t row_index, size_t column_index)
{
    if (!marks)
        loadMarks();

    if (column_index >= columns_num)
        throw Exception(
            "Column index " + std::to_string(column_index)
                + " is out of range [0, " + std::to_string(columns_num)
                + ") while getting mark from " + mrk_path,
            ErrorCodes::LOGICAL_ERROR);

    return (*marks)[row_index * columns_num + column_index];
}
/// Fills `marks`, going through the mark cache when one was provided.
///
/// - No cache: the marks come straight from `load_func`.
/// - Cache and `save_marks_in_cache`: getOrSet is used, so freshly loaded marks are cached.
/// - Cache without `save_marks_in_cache`: the cache is only queried; on a miss
///   the marks come from `load_func` and are not stored.
///
/// Throws LOGICAL_ERROR if `marks` is still empty afterwards.
void MergeTreeMarksLoader::loadMarks()
{
    if (!mark_cache)
    {
        marks = load_func();
    }
    else
    {
        auto cache_key = mark_cache->hash(mrk_path);

        if (!save_marks_in_cache)
        {
            marks = mark_cache->get(cache_key);
            if (!marks)
                marks = load_func();
        }
        else
        {
            marks = mark_cache->getOrSet(cache_key, load_func);
        }
    }

    if (!marks)
        throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR);
}
}

View File

@ -0,0 +1,35 @@
#include <Storages/MarkCache.h>
namespace DB
{
/// Lazily obtains the marks of a single marks file and gives indexed access to them.
/// Nothing is loaded at construction; the marks are fetched by the first getMark() call,
/// through the mark cache when one was provided.
class MergeTreeMarksLoader
{
public:
    using MarksPtr = MarkCache::MappedPtr;
    using LoadFunc = std::function<MarksPtr()>;

    /// Creates an empty loader (no cache, no load function, one column).
    /// It is meant to be overwritten by a fully constructed loader before getMark() is used.
    MergeTreeMarksLoader() = default;

    /// mark_cache_          - cache to consult when loading; may be null.
    /// mrk_path_            - path of the marks file.
    /// load_func_           - callback that produces the marks.
    /// save_marks_in_cache_ - whether loaded marks are put into the cache.
    /// columns_num_         - number of marks per row of the marks array.
    MergeTreeMarksLoader(MarkCache * mark_cache_,
        const String & mrk_path_,
        const LoadFunc & load_func_,
        bool save_marks_in_cache_,
        size_t columns_num_ = 1);

    /// Returns the mark at (row_index, column_index), loading the marks if that has not happened yet.
    const MarkInCompressedFile & getMark(size_t row_index, size_t column_index = 0);

    /// True once the marks have been loaded.
    bool initialized() const { return marks != nullptr; }

private:
    MarkCache * mark_cache = nullptr;
    String mrk_path;
    LoadFunc load_func;
    bool save_marks_in_cache = false;
    /// Initialized in-class so that a default-constructed loader never holds an indeterminate value;
    /// 1 matches the default of the constructor's `columns_num_` argument.
    size_t columns_num = 1;
    MarksPtr marks;

    void loadMarks();
};
}

View File

@ -19,6 +19,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(const MergeTreeData::DataPartPtr
, uncompressed_cache_, mark_cache_, mark_ranges_
, settings_, avg_value_size_hints_)
{
initMarksLoader();
size_t buffer_size = settings.max_read_buffer_size;
if (uncompressed_cache)
@ -121,13 +122,14 @@ void MergeTreeReaderCompact::readData(
}
void MergeTreeReaderCompact::loadMarks()
void MergeTreeReaderCompact::initMarksLoader()
{
const auto & index_granularity_info = data_part->index_granularity_info;
size_t marks_count = data_part->getMarksCount();
std::string mrk_path = index_granularity_info.getMarksFilePath(path + NAME_OF_FILE_WITH_DATA);
size_t columns_num = data_part->columns.size();
auto load_func = [&]() -> MarkCache::MappedPtr
auto load = [&]() -> MarkCache::MappedPtr
{
size_t file_size = Poco::File(mrk_path).getSize();
@ -140,7 +142,6 @@ void MergeTreeReaderCompact::loadMarks()
/// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache.
auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerActionLock();
size_t columns_num = data_part->columns.size();
auto res = std::make_shared<MarksInCompressedFile>(marks_count * columns_num);
@ -168,25 +169,14 @@ void MergeTreeReaderCompact::loadMarks()
return res;
};
std::cerr << "(MergeTreeReaderCompact::loadMarks) table: " << storage.getTableName() << ", part: " << path << "\n";
std::cerr << "(MergeTreeReaderCompact::loadMarks) start marks load..." << "\n";
auto marks_array = IMergeTreeReader::loadMarks(mrk_path, load_func);
marks = MarksInCompressedFileCompact(marks_array, columns.size());
marks_loader = MergeTreeMarksLoader{mark_cache, mrk_path, load, settings.save_marks_in_cache, columns_num};
std::cerr << "(MergeTreeReaderCompact::loadMarks) end marks load..." << "\n";
}
const MarkInCompressedFile & MergeTreeReaderCompact::getMark(size_t row, size_t col)
void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index)
{
if (!marks.initialized())
loadMarks();
return marks.getMark(row, col);
}
void MergeTreeReaderCompact::seekToMark(size_t row, size_t col)
{
MarkInCompressedFile mark = getMark(row, col);
MarkInCompressedFile mark = marks_loader.getMark(row_index, column_index);
std::cerr << "(MergeTreeReaderCompact::seekToMark) mark: (" << mark.offset_in_compressed_file << ", " << mark.offset_in_decompressed_block << "\n";
@ -201,7 +191,7 @@ void MergeTreeReaderCompact::seekToMark(size_t row, size_t col)
{
/// Better diagnostics.
if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND)
e.addMessage("(while seeking to mark (" + toString(row) + ", " + toString(col) + ")");
e.addMessage("(while seeking to mark (" + toString(row_index) + ", " + toString(column_index) + ")");
throw;
}

View File

@ -8,38 +8,6 @@
namespace DB
{
/// Wraps a shared flat array of marks and addresses it as a row-major table
/// with `columns_num` marks per row.
class MarksInCompressedFileCompact
{
public:
    using MarksPtr = MarkCache::MappedPtr;

    /// Creates an uninitialized wrapper: no data and zero columns.
    MarksInCompressedFileCompact() = default;

    MarksInCompressedFileCompact(const MarksPtr & data_, size_t columns_num_)
        : data(data_), columns_num(columns_num_) {}

    /// Returns the mark of `column` in row `index`. No range checks are performed.
    const MarkInCompressedFile & getMark(size_t index, size_t column) const
    {
        return (*data)[index * columns_num + column];
    }

    /// Address of the first mark of row `index`, as raw bytes.
    char * getRowAddress(size_t index) const
    {
        return reinterpret_cast<char *>(data->data() + index * columns_num);
    }

    /// Size in bytes of one row of marks.
    size_t getRowSize() const
    {
        return sizeof(MarkInCompressedFile) * columns_num;
    }

    /// True when the wrapper holds a marks array. Const so it can be queried on const objects,
    /// like the other accessors.
    bool initialized() const { return data != nullptr; }

private:
    MarksPtr data;
    /// Initialized in-class so a default-constructed object never carries an indeterminate value.
    size_t columns_num = 0;
};
/// Reads the data between pairs of marks in the same part. When reading consecutive ranges, avoids unnecessary seeks.
/// When ranges are almost consecutive, seeks are fast because they are performed inside the buffer.
/// Avoids loading the marks file if it is not needed (e.g. when reading the whole part).
@ -63,9 +31,9 @@ private:
std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;
std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;
MarksInCompressedFileCompact marks;
MergeTreeMarksLoader marks_loader;
void loadMarks();
void initMarksLoader();
void seekToStart();
void seekToMark(size_t row, size_t col);
const MarkInCompressedFile & getMark(size_t row, size_t col);

View File

@ -32,6 +32,8 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// Care should be taken to not load marks when the part is empty (marks_count == 0).
initMarksLoader();
for (const auto & mark_range : all_mark_ranges)
{
size_t left_mark = mark_range.begin;
@ -41,10 +43,10 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// and we will use max_read_buffer_size for buffer size, thus avoiding the need to load marks.
/// If the end of range is inside the block, we will need to read it too.
if (right_mark < marks_count && getMark(right_mark).offset_in_decompressed_block > 0)
if (right_mark < marks_count && marks_loader.getMark(right_mark).offset_in_decompressed_block > 0)
{
while (right_mark < marks_count
&& getMark(right_mark).offset_in_compressed_file == getMark(mark_range.end).offset_in_compressed_file)
&& marks_loader.getMark(right_mark).offset_in_compressed_file == marks_loader.getMark(mark_range.end).offset_in_compressed_file)
{
++right_mark;
}
@ -55,13 +57,13 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// If there are no marks after the end of range, just use file size
if (right_mark >= marks_count
|| (right_mark + 1 == marks_count
&& getMark(right_mark).offset_in_compressed_file == getMark(mark_range.end).offset_in_compressed_file))
&& marks_loader.getMark(right_mark).offset_in_compressed_file == marks_loader.getMark(mark_range.end).offset_in_compressed_file))
{
mark_range_bytes = file_size - (left_mark < marks_count ? getMark(left_mark).offset_in_compressed_file : 0);
mark_range_bytes = file_size - (left_mark < marks_count ? marks_loader.getMark(left_mark).offset_in_compressed_file : 0);
}
else
{
mark_range_bytes = getMark(right_mark).offset_in_compressed_file - getMark(left_mark).offset_in_compressed_file;
mark_range_bytes = marks_loader.getMark(right_mark).offset_in_compressed_file - marks_loader.getMark(left_mark).offset_in_compressed_file;
}
max_mark_range_bytes = std::max(max_mark_range_bytes, mark_range_bytes);
@ -101,16 +103,11 @@ MergeTreeReaderStream::MergeTreeReaderStream(
}
const MarkInCompressedFile & MergeTreeReaderStream::getMark(size_t index)
void MergeTreeReaderStream::initMarksLoader()
{
if (!marks)
loadMarks();
return (*marks)[index];
}
if (marks_loader.initialized())
return;
void MergeTreeReaderStream::loadMarks()
{
std::string mrk_path = index_granularity_info->getMarksFilePath(path_prefix);
auto load = [&]() -> MarkCache::MappedPtr
@ -153,31 +150,13 @@ void MergeTreeReaderStream::loadMarks()
return res;
};
if (mark_cache)
{
auto key = mark_cache->hash(mrk_path);
if (save_marks_in_cache)
{
marks = mark_cache->getOrSet(key, load);
}
else
{
marks = mark_cache->get(key);
if (!marks)
marks = load();
}
}
else
marks = load();
if (!marks)
throw Exception("Failed to load marks: " + mrk_path, ErrorCodes::LOGICAL_ERROR);
marks_loader = MergeTreeMarksLoader{mark_cache, mrk_path, load, save_marks_in_cache};
}
void MergeTreeReaderStream::seekToMark(size_t index)
{
MarkInCompressedFile mark = getMark(index);
MarkInCompressedFile mark = marks_loader.getMark(index);
try
{

View File

@ -6,6 +6,7 @@
#include <Compression/CachedCompressedReadBuffer.h>
#include <Compression/CompressedReadBufferFromFile.h>
#include <Storages/MergeTree/MergeTreeReaderSettings.h>
#include <Storages/MergeTree/MergeTreeMarksLoader.h>
namespace DB
@ -33,7 +34,7 @@ private:
/// NOTE: lazily loads marks from the marks cache.
const MarkInCompressedFile & getMark(size_t index);
void loadMarks();
void initMarksLoader();
std::string path_prefix;
std::string data_file_extension;
@ -48,5 +49,7 @@ private:
std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;
std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;
MergeTreeMarksLoader marks_loader;
};
}