polymorphic parts (development) columns sizes

CurtizJ 2019-12-03 17:33:56 +03:00
parent be0e13d28f
commit 31ffad0fb0
7 changed files with 70 additions and 31 deletions

View File

@@ -366,6 +366,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks
loadChecksums(require_columns_checksums);
loadIndexGranularity();
loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity`
loadColumnSizes();
loadRowsCount(); /// Must be called after loadIndex() as it uses the value of `index_granularity`.
loadPartitionAndMinMaxIndex();
loadTTLInfos();
@@ -588,6 +589,28 @@ void IMergeTreeDataPart::loadColumns(bool require)
sample_block.insert({it.type, it.name});
}
void IMergeTreeDataPart::loadColumnSizes()
{
size_t columns_num = columns.size();
if (columns_num == 0)
throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
auto column_sizes_path = getFullPath() + "columns_sizes.txt";
auto columns_sizes_file = Poco::File(column_sizes_path);
if (!columns_sizes_file.exists())
{
LOG_WARNING(storage.log, "No file columns_sizes.txt in part " + name);
return;
}
ReadBufferFromFile buffer(column_sizes_path, columns_sizes_file.getSize());
auto it = columns.begin();
for (size_t i = 0; i < columns_num; ++i, ++it)
readPODBinary(columns_sizes[it->name], buffer);
assertEOF(buffer);
}
UInt64 IMergeTreeDataPart::calculateTotalSizeOnDisk(const String & from)
{
@@ -638,6 +661,7 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_
relative_path = new_relative_path;
}
void IMergeTreeDataPart::remove() const
{
if (!isStoredOnDisk())

View File

@@ -299,6 +299,8 @@ public:
*/
mutable std::mutex alter_mutex;
ColumnSizeByName columns_sizes;
/// For data in RAM ('index')
UInt64 getIndexSizeInBytes() const;
UInt64 getIndexSizeInAllocatedBytes() const;
@@ -317,6 +319,7 @@ public:
static UInt64 calculateTotalSizeOnDisk(const String & from);
protected:
void removeIfNeeded();
virtual void checkConsistency(bool require_part_metadata) const;
@@ -344,6 +347,8 @@ private:
void loadPartitionAndMinMaxIndex();
void loadColumnSizes();
String getRelativePathForDetachedPart(const String & prefix) const;
};

View File

@@ -88,6 +88,11 @@ public:
return Columns(std::make_move_iterator(index_columns.begin()), std::make_move_iterator(index_columns.end()));
}
const MergeTreeData::ColumnSizeByName & getColumnsSizes() const
{
return columns_sizes;
}
void initSkipIndices();
void initPrimaryIndex();
void calculateAndSerializePrimaryIndex(const Block & primary_index_block, size_t rows);
@@ -143,6 +148,8 @@ protected:
bool data_written = false;
bool primary_index_initialized = false;
bool skip_indices_initialized = false;
MergeTreeData::ColumnSizeByName columns_sizes;
};
using MergeTreeWriterPtr = std::unique_ptr<IMergeTreeDataPartWriter>;

View File

@@ -186,28 +186,6 @@ void MergeTreeDataPartCompact::loadIndexGranularity()
index_granularity.setInitialized();
}
void MergeTreeDataPartCompact::loadColumnSizes()
{
size_t columns_num = columns.size();
if (columns_num == 0)
throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
auto column_sizes_path = getFullPath() + "columns_sizes.txt";
auto columns_sizes_file = Poco::File(column_sizes_path);
if (!columns_sizes_file.exists())
{
LOG_WARNING(storage.log, "No file column_sizes.txt in part " + name);
return;
}
ReadBufferFromFile buffer(column_sizes_path, columns_sizes_file.getSize());
auto it = columns.begin();
for (size_t i = 0; i < columns_num; ++i, ++it)
readPODBinary(columns_sizes[it->name], buffer);
assertEOF(buffer);
}
void MergeTreeDataPartCompact::checkConsistency(bool require_part_metadata)
{
UNUSED(require_part_metadata);

View File

@@ -83,12 +83,7 @@ private:
/// Loads marks index granularity into memory
void loadIndexGranularity() override;
void loadColumnSizes();
void checkConsistency(bool require_part_metadata);
ColumnSizeByName columns_sizes;
};

View File

@@ -104,11 +104,7 @@ void MergeTreeDataPartWriterCompact::writeBlock(const Block & block)
writeIntBinary(rows_to_write, stream->marks);
for (const auto & it : columns_list)
{
writeIntBinary(stream->plain_hashing.count(), stream->marks);
writeIntBinary(stream->compressed.offset(), stream->marks);
next_row = writeColumnSingleGranule(block.getByName(it.name), current_row, rows_to_write);
}
++from_mark;
current_row = next_row;
@@ -125,6 +121,13 @@ size_t MergeTreeDataPartWriterCompact::writeColumnSingleGranule(const ColumnWith
std::cerr << "(writeColumnSingleGranule) from_row: " << from_row << "\n";
std::cerr << "(writeColumnSingleGranule) number_of_rows: " << number_of_rows << "\n";
/// FIXME compressed size does not work
size_t old_compressed_size = stream->compressed_buf.getCompressedBytes() + stream->plain_hashing.count();
size_t old_uncompressed_size = stream->compressed.count();
writeIntBinary(stream->plain_hashing.count(), stream->marks);
writeIntBinary(stream->compressed.offset(), stream->marks);
IDataType::SerializeBinaryBulkStatePtr state;
IDataType::SerializeBinaryBulkSettings serialize_settings;
@@ -136,6 +139,12 @@ size_t MergeTreeDataPartWriterCompact::writeColumnSingleGranule(const ColumnWith
column.type->serializeBinaryBulkWithMultipleStreams(*column.column, from_row, number_of_rows, serialize_settings, state);
column.type->serializeBinaryBulkStateSuffix(serialize_settings, state);
/// FIXME compressed size does not work
size_t compressed_size = stream->compressed_buf.getCompressedBytes() + stream->plain_hashing.count();
size_t uncompressed_size = stream->compressed.count();
columns_sizes[column.name].add(ColumnSize{0, compressed_size - old_compressed_size, uncompressed_size - old_uncompressed_size});
return from_row + number_of_rows;
}
@@ -155,6 +164,10 @@ void MergeTreeDataPartWriterCompact::finishDataSerialization(IMergeTreeDataPart:
}
}
size_t marks_size = stream->marks.count();
for (auto it = columns_sizes.begin(); it != columns_sizes.end(); ++it)
it->second.marks = marks_size;
stream->finalize();
if (sync)
stream->sync();
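
The per-column accounting in writeColumnSingleGranule() above follows a before/after-counter pattern: snapshot the compressed and uncompressed byte counters of the shared stream, serialize one granule of one column, and credit the difference to that column via ColumnSize::add; finishDataSerialization() then stamps the total marks-file size on every column, since a compact part keeps a single shared marks file. A minimal sketch of the same pattern with made-up counter and stream types (none of these names come from ClickHouse, and the real compressed-size counters behave differently, as the FIXME notes):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ColumnSizeRecord
{
    uint64_t marks = 0;
    uint64_t data_compressed = 0;
    uint64_t data_uncompressed = 0;

    void add(const ColumnSizeRecord & other)
    {
        marks += other.marks;
        data_compressed += other.data_compressed;
        data_uncompressed += other.data_uncompressed;
    }
};

/// Toy stand-in for the shared compressed stream of a compact part.
struct FakeStream
{
    uint64_t compressed_bytes = 0;    /// analogue of compressed_buf.getCompressedBytes() + plain_hashing.count()
    uint64_t uncompressed_bytes = 0;  /// analogue of compressed.count()

    void writeGranule(uint64_t raw_bytes)
    {
        uncompressed_bytes += raw_bytes;
        compressed_bytes += raw_bytes / 2;  /// pretend 2x compression
    }
};

int main()
{
    FakeStream stream;
    std::map<std::string, ColumnSizeRecord> columns_sizes;
    std::vector<std::string> columns = {"key", "value"};

    for (int granule = 0; granule < 3; ++granule)
    {
        for (const auto & name : columns)
        {
            /// Snapshot counters, write one granule of one column, credit the delta.
            uint64_t old_compressed = stream.compressed_bytes;
            uint64_t old_uncompressed = stream.uncompressed_bytes;
            stream.writeGranule(1000);
            columns_sizes[name].add({0, stream.compressed_bytes - old_compressed,
                                        stream.uncompressed_bytes - old_uncompressed});
        }
    }

    /// Compact parts share one marks file, so every column is stamped with its full size.
    uint64_t marks_size = 96;  /// hypothetical total size of the marks file
    for (auto & [name, size] : columns_sizes)
        size.marks = marks_size;

    for (const auto & [name, size] : columns_sizes)
        std::cout << name << ": compressed=" << size.data_compressed
                  << " marks=" << size.marks << "\n";
}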

View File

@@ -132,6 +132,22 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
checksums.files["ttl.txt"].file_hash = out_hashing.getHash();
}
const auto & columns_sizes = writer->getColumnsSizes();
if (!columns_sizes.empty())
{
WriteBufferFromFile out(part_path + "columns_sizes.txt", 4096);
HashingWriteBuffer out_hashing(out);
for (const auto & column : columns_list)
{
auto it = columns_sizes.find(column.name);
if (it == columns_sizes.end())
throw Exception("Not found size for column " + column.name, ErrorCodes::LOGICAL_ERROR);
writePODBinary(it->second, out_hashing);
checksums.files["columns_sizes.txt"].file_size = out_hashing.count();
checksums.files["columns_sizes.txt"].file_hash = out_hashing.getHash();
}
}
{
/// Write a file with a description of columns.
WriteBufferFromFile out(part_path + "columns.txt", 4096);
@@ -151,6 +167,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
new_part->checksums = checksums;
new_part->bytes_on_disk = checksums.getTotalSizeOnDisk();
new_part->index_granularity = writer->getIndexGranularity();
new_part->columns_sizes = columns_sizes;
std::cerr << "(writeSuffixAndFinalizePart) part: " << new_part->getFullPath() << "\n";
std::cerr << "(writeSuffixAndFinalizePart) marks_count: " << new_part->index_granularity.getMarksCount() << "\n";
}
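
For reference, the write path added in writeSuffixAndFinalizePart() pairs with the read path in IMergeTreeDataPart::loadColumnSizes(): records are emitted in columns_list order through a hashing buffer so that both a size and a checksum can be recorded for columns_sizes.txt. Below is a minimal sketch of that bookkeeping with a toy counting/hashing wrapper; the FNV-style hash is only a placeholder and not what ClickHouse's HashingWriteBuffer actually uses:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

/// Toy analogue of HashingWriteBuffer: counts bytes and folds them into a hash.
/// The real class wraps another WriteBuffer and uses a different hash function.
struct CountingHashingBuffer
{
    uint64_t bytes = 0;
    uint64_t hash = 1469598103934665603ULL;  /// FNV-1a offset basis (placeholder)

    void write(const char * data, size_t size)
    {
        for (size_t i = 0; i < size; ++i)
        {
            hash ^= static_cast<unsigned char>(data[i]);
            hash *= 1099511628211ULL;
        }
        bytes += size;
    }

    uint64_t count() const { return bytes; }   /// becomes checksums file_size
    uint64_t getHash() const { return hash; }  /// becomes checksums file_hash
};

struct ColumnSizeRecord
{
    uint64_t marks = 0;
    uint64_t data_compressed = 0;
    uint64_t data_uncompressed = 0;
};

int main()
{
    /// Hypothetical per-column sizes gathered by the writer, in columns_list order.
    std::vector<std::pair<std::string, ColumnSizeRecord>> columns_list = {
        {"key", {96, 300, 1200}},
        {"value", {96, 500, 2000}},
    };

    CountingHashingBuffer out_hashing;
    for (const auto & entry : columns_list)
        out_hashing.write(reinterpret_cast<const char *>(&entry.second), sizeof(entry.second));

    /// What the checksums bookkeeping for "columns_sizes.txt" records.
    std::cout << "file_size=" << out_hashing.count()
              << " file_hash=" << out_hashing.getHash() << "\n";
}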