polymorphic parts (development)

This commit is contained in:
CurtizJ 2019-10-19 19:49:36 +03:00
parent 8ba37da2ef
commit 3ebb2ab7c6
8 changed files with 140 additions and 265 deletions

View File

@ -0,0 +1,67 @@
#include <Storages/MergeTree/IMergeTreeDataPartWriter.h>
namespace DB
{
/// Stores every constructor argument in the member of the same name
/// (the permutation is kept as a pointer, not copied).
///
/// NOTE(review): the class declaration shown for this change also lists a
/// `storage` parameter/member and a `min_compress_block_size` member, neither
/// of which is initialized here - confirm that declaration and definition agree.
IMergeTreeDataPartWriter::IMergeTreeDataPartWriter(
const String & part_path_,
const NamesAndTypesList & columns_list_,
const IColumn::Permutation * permutation_,
const String & marks_file_extension_,
const CompressionCodecPtr & default_codec_,
size_t max_compress_block_size_,
size_t aio_threshold_)
: part_path(part_path_)
, columns_list(columns_list_)
, permutation(permutation_)
, marks_file_extension(marks_file_extension_)
, default_codec(default_codec_)
, max_compress_block_size(max_compress_block_size_)
, aio_threshold(aio_threshold_) {}
/// Calls next() on `compressed`, then on the data file buffer `plain_file`,
/// then on `marks`, in that order.
/// The order follows the data chain set up in the constructor
/// (compressed -> compressed_buf -> plain_hashing -> plain_file): the outer
/// buffer is pushed before the file buffer it ultimately writes to.
void IMergeTreeDataPartWriter::ColumnStream::finalize()
{
compressed.next();
plain_file->next();
marks.next();
}
/// Calls sync() on the two file-backed buffers of this stream:
/// the data file (`plain_file`) and the marks file (`marks_file`).
void IMergeTreeDataPartWriter::ColumnStream::sync()
{
plain_file->sync();
marks_file.sync();
}
/// Creates the data file and the marks file of one column stream and stacks
/// the write buffers on top of them:
///
///   data:  compressed -> compressed_buf -> plain_hashing -> plain_file
///   marks: marks -> marks_file
///
/// The data file name is `data_path_` followed by the data file extension, the
/// marks file name is `marks_path_` followed by the marks file extension.
IMergeTreeDataPartWriter::ColumnStream::ColumnStream(
    const String & escaped_column_name_,
    const String & data_path_,
    const std::string & data_file_extension_,
    const std::string & marks_path_,
    const std::string & marks_file_extension_,
    const CompressionCodecPtr & compression_codec_,
    size_t max_compress_block_size_,
    size_t estimated_size_,
    size_t aio_threshold_)
    : escaped_column_name(escaped_column_name_)
    , data_file_extension(data_file_extension_)
    , marks_file_extension(marks_file_extension_)
    /// The file names below are built from the *members* initialized just above
    /// (presumably declared before the buffers in the struct - confirm against the header).
    , plain_file(createWriteBufferFromFileBase(
          data_path_ + data_file_extension,
          estimated_size_,
          aio_threshold_,
          max_compress_block_size_))
    , plain_hashing(*plain_file)
    , compressed_buf(plain_hashing, compression_codec_)
    , compressed(compressed_buf)
    /// 4096 is passed as the second constructor argument (presumably the buffer size).
    , marks_file(marks_path_ + marks_file_extension, 4096, O_TRUNC | O_CREAT | O_WRONLY)
    , marks(marks_file)
{
}
/// Records the sizes and hashes of this stream's two files in `checksums`.
///
/// Two entries of `checksums.files` are filled in. Their keys are the escaped
/// column name followed by the data file extension and by the marks file
/// extension respectively.
void IMergeTreeDataPartWriter::ColumnStream::addToChecksums(MergeTreeData::DataPart::Checksums & checksums)
{
    const String & name = escaped_column_name;

    /// Each entry is looked up (and created if missing) exactly once; the
    /// fields are then filled through the reference instead of rebuilding the
    /// key and searching the container again for every field.
    {
        auto & data_checksum = checksums.files[name + data_file_extension];
        data_checksum.is_compressed = true;
        /// `compressed` wraps `compressed_buf`: its counters are stored as the uncompressed size/hash.
        data_checksum.uncompressed_size = compressed.count();
        data_checksum.uncompressed_hash = compressed.getHash();
        /// `plain_hashing` wraps the file buffer: its counters are stored as the file size/hash.
        data_checksum.file_size = plain_hashing.count();
        data_checksum.file_hash = plain_hashing.getHash();
    }

    {
        auto & marks_checksum = checksums.files[name + marks_file_extension];
        marks_checksum.file_size = marks.count();
        marks_checksum.file_hash = marks.getHash();
    }
}
}

View File

@ -1,6 +1,7 @@
#include <Storages/MergeTree/MergeTreeIndexGranularity.h>
#include <Storages/MergeTree/MergeTreeIndexGranularityInfo.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteBufferFromFileBase.h>
#include <Compression/CompressedWriteBuffer.h>
#include <IO/HashingWriteBuffer.h>
#include <Storages/MergeTree/MergeTreeData.h>
@ -49,47 +50,41 @@ public:
void addToChecksums(MergeTreeData::DataPart::Checksums & checksums);
};
using ColumnStreamPtr = std::unique_ptr<ColumnStream>;
IMergeTreeDataPartWriter(
const String & part_path,
const MergeTreeData & storage,
const NamesAndTypesList & columns_list,
const IColumn::Permutation * permutation,
const String & marks_file_extension,
const CompressionCodecPtr & default_codec,
size_t max_compress_block_size,
size_t aio_threshold);
virtual size_t write(
const Block & block, size_t from_mark, size_t offset,
const Block & block, size_t from_mark, size_t offset, const MergeTreeIndexGranularity & index_granularity,
/* Blocks with already sorted index columns */
const Block & primary_key_block = {}, const Block & skip_indexes_block = {}) = 0;
virtual std::pair<size_t, size_t> writeColumn(
const String & name,
const IDataType & type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
bool skip_offsets,
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
size_t from_mark) = 0;
// /// Write single granule of one column (rows between 2 marks)
// virtual size_t writeSingleGranule(
// const String & name,
// const IDataType & type,
// const IColumn & column,
// WrittenOffsetColumns & offset_columns,
// bool skip_offsets,
// IDataType::SerializeBinaryBulkStatePtr & serialization_state,
// IDataType::SerializeBinaryBulkSettings & serialize_settings,
// size_t from_row,
// size_t number_of_rows,
// bool write_marks) = 0;
// /// Write mark for column
// virtual void writeSingleMark(
// const String & name,
// const IDataType & type,
// WrittenOffsetColumns & offset_columns,
// bool skip_offsets,
// size_t number_of_rows,
// DB::IDataType::SubstreamPath & path) = 0;
protected:
void start();
using SerializationState = IDataType::SerializeBinaryBulkStatePtr;
using SerializationStates = std::vector<SerializationState>;
const NamesAndTypesList & columns_list;
IColumn::Permutation * permutation;
bool started = false;
String part_path;
NamesAndTypesList columns_list;
const IColumn::Permutation * permutation;
const String marks_file_extension;
const MergeTreeData & storage;
CompressionCodecPtr default_codec;
size_t min_compress_block_size;
size_t max_compress_block_size;
size_t aio_threshold;
};
using MergeTreeDataPartWriterPtr = std::unique_ptr<IMergeTreeDataPartWriter>;
}

View File

@ -1,6 +1,6 @@
#include <Storages/MergeTree/IMergedBlockOutputStream.h>
#include <IO/createWriteBufferFromFileBase.h>
#include <Storages/MergeTree/MergeTreeReaderSettings.h>
namespace DB
{
@ -17,25 +17,19 @@ namespace
IMergedBlockOutputStream::IMergedBlockOutputStream(
MergeTreeData & storage_,
const String & part_path_,
size_t min_compress_block_size_,
size_t max_compress_block_size_,
const MergeTreeDataPartPtr & data_part,
CompressionCodecPtr codec_,
size_t aio_threshold_,
const WriterSettings & writer_settings_,
bool blocks_are_granules_size_,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const MergeTreeIndexGranularity & index_granularity_,
const MergeTreeIndexGranularityInfo * index_granularity_info_)
: storage(storage_)
, part_path(part_path_)
, min_compress_block_size(min_compress_block_size_)
, max_compress_block_size(max_compress_block_size_)
, aio_threshold(aio_threshold_)
, can_use_adaptive_granularity(index_granularity_info_ ? index_granularity_info_->is_adaptive : storage.canUseAdaptiveGranularity())
, marks_file_extension(can_use_adaptive_granularity ? getAdaptiveMrkExtension() : getNonAdaptiveMrkExtension())
bool can_use_adaptive_granularity_)
: storage(data_part->storage)
, part_path(data_part->getFullPath())
, writer_settings(writer_settings_)
, can_use_adaptive_granularity(can_use_adaptive_granularity_)
, blocks_are_granules_size(blocks_are_granules_size_)
, index_granularity(index_granularity_)
, index_granularity(data_part->index_granularity)
, compute_granularity(index_granularity.empty())
, codec(std::move(codec_))
, skip_indices(indices_to_recalc)
@ -43,42 +37,10 @@ IMergedBlockOutputStream::IMergedBlockOutputStream(
{
if (blocks_are_granules_size && !index_granularity.empty())
throw Exception("Can't take information about index granularity from blocks, when non empty index_granularity array specified", ErrorCodes::LOGICAL_ERROR);
writer = data_part->getWriter(columns_list, permutation, default_codec, writer_settings);
}
void IMergedBlockOutputStream::addStreams(
const String & path,
const String & name,
const IDataType & type,
const CompressionCodecPtr & effective_codec,
size_t estimated_size,
bool skip_offsets)
{
IDataType::StreamCallback callback = [&] (const IDataType::SubstreamPath & substream_path)
{
if (skip_offsets && !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes)
return;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
/// Shared offsets for Nested type.
if (column_streams.count(stream_name))
return;
column_streams[stream_name] = std::make_unique<ColumnStream>(
stream_name,
path + stream_name, DATA_FILE_EXTENSION,
path + stream_name, marks_file_extension,
effective_codec,
max_compress_block_size,
estimated_size,
aio_threshold);
};
IDataType::SubstreamPath stream_path;
type.enumerateStreams(callback, stream_path);
}
IDataType::OutputStreamGetter IMergedBlockOutputStream::createStreamGetter(
const String & name, WrittenOffsetColumns & offset_columns, bool skip_offsets)
{
@ -150,39 +112,6 @@ void IMergedBlockOutputStream::fillIndexGranularity(const Block & block)
can_use_adaptive_granularity);
}
void IMergedBlockOutputStream::writeSingleMark(
const String & name,
const IDataType & type,
WrittenOffsetColumns & offset_columns,
bool skip_offsets,
size_t number_of_rows,
DB::IDataType::SubstreamPath & path)
{
type.enumerateStreams([&] (const IDataType::SubstreamPath & substream_path)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == IDataType::Substream::ArraySizes;
if (is_offsets && skip_offsets)
return;
String stream_name = IDataType::getFileNameForStream(name, substream_path);
/// Don't write offsets more than one time for Nested type.
if (is_offsets && offset_columns.count(stream_name))
return;
ColumnStream & stream = *column_streams[stream_name];
/// There could already be enough data to compress into the new block.
if (stream.compressed.offset() >= min_compress_block_size)
stream.compressed.next();
writeIntBinary(stream.plain_hashing.count(), stream.marks);
writeIntBinary(stream.compressed.offset(), stream.marks);
if (can_use_adaptive_granularity)
writeIntBinary(number_of_rows, stream.marks);
}, path);
}
size_t IMergedBlockOutputStream::writeSingleGranule(
const String & name,
const IDataType & type,
@ -421,50 +350,4 @@ void IMergedBlockOutputStream::finishSkipIndicesSerialization(
/// Implementation of IMergedBlockOutputStream::ColumnStream.
/// Creates the data file and the marks file of one column stream and stacks
/// the write buffers on top of them:
///
///   data:  compressed -> compressed_buf -> plain_hashing -> plain_file
///   marks: marks -> marks_file
///
/// The data file name is `data_path_` followed by the data file extension, the
/// marks file name is `marks_path_` followed by the marks file extension.
IMergedBlockOutputStream::ColumnStream::ColumnStream(
    const String & escaped_column_name_,
    const String & data_path_,
    const std::string & data_file_extension_,
    const std::string & marks_path_,
    const std::string & marks_file_extension_,
    const CompressionCodecPtr & compression_codec_,
    size_t max_compress_block_size_,
    size_t estimated_size_,
    size_t aio_threshold_)
    : escaped_column_name(escaped_column_name_)
    , data_file_extension(data_file_extension_)
    , marks_file_extension(marks_file_extension_)
    /// The file names below are built from the *members* initialized just above;
    /// the struct declares them before the buffers, so they are ready at this point.
    , plain_file(createWriteBufferFromFileBase(
          data_path_ + data_file_extension,
          estimated_size_,
          aio_threshold_,
          max_compress_block_size_))
    , plain_hashing(*plain_file)
    , compressed_buf(plain_hashing, compression_codec_)
    , compressed(compressed_buf)
    /// 4096 is passed as the second constructor argument (presumably the buffer size).
    , marks_file(marks_path_ + marks_file_extension, 4096, O_TRUNC | O_CREAT | O_WRONLY)
    , marks(marks_file)
{
}
/// Calls next() on `compressed`, then on the data file buffer `plain_file`,
/// then on `marks`, in that order.
/// The order follows the data chain
/// compressed -> compressed_buf -> plain_hashing -> plain_file: the outer
/// buffer is pushed before the file buffer it ultimately writes to.
void IMergedBlockOutputStream::ColumnStream::finalize()
{
compressed.next();
plain_file->next();
marks.next();
}
/// Calls sync() on the two file-backed buffers of this stream:
/// the data file (`plain_file`) and the marks file (`marks_file`).
void IMergedBlockOutputStream::ColumnStream::sync()
{
plain_file->sync();
marks_file.sync();
}
/// Records the sizes and hashes of this stream's two files in `checksums`.
///
/// Two entries of `checksums.files` are filled in. Their keys are the escaped
/// column name followed by the data file extension and by the marks file
/// extension respectively.
void IMergedBlockOutputStream::ColumnStream::addToChecksums(MergeTreeData::DataPart::Checksums & checksums)
{
    const String & name = escaped_column_name;

    /// Each entry is looked up (and created if missing) exactly once; the
    /// fields are then filled through the reference instead of rebuilding the
    /// key and searching the container again for every field.
    {
        auto & data_checksum = checksums.files[name + data_file_extension];
        data_checksum.is_compressed = true;
        /// `compressed` wraps `compressed_buf`: its counters are stored as the uncompressed size/hash.
        data_checksum.uncompressed_size = compressed.count();
        data_checksum.uncompressed_hash = compressed.getHash();
        /// `plain_hashing` wraps the file buffer: its counters are stored as the file size/hash.
        data_checksum.file_size = plain_hashing.count();
        data_checksum.file_hash = plain_hashing.getHash();
    }

    {
        auto & marks_checksum = checksums.files[name + marks_file_extension];
        marks_checksum.file_size = marks.count();
        marks_checksum.file_hash = marks.getHash();
    }
}
}

View File

@ -7,6 +7,7 @@
#include <IO/HashingWriteBuffer.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <DataStreams/IBlockOutputStream.h>
#include <Storages/MergeTree/IMergeTreeDataPartWriter.h>
namespace DB
@ -16,16 +17,13 @@ class IMergedBlockOutputStream : public IBlockOutputStream
{
public:
IMergedBlockOutputStream(
MergeTreeData & storage_,
const String & part_path_,
size_t min_compress_block_size_,
size_t max_compress_block_size_,
CompressionCodecPtr default_codec_,
size_t aio_threshold_,
const MergeTreeDataPartPtr & data_part,
CompressionCodecPtr codec_,
const WriterSettings & writer_settings_,
bool blocks_are_granules_size_,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const MergeTreeIndexGranularity & index_granularity_,
const MergeTreeIndexGranularityInfo * index_granularity_info_ = nullptr);
bool can_use_adaptive_granularity_);
using WrittenOffsetColumns = std::set<std::string>;
@ -33,83 +31,8 @@ protected:
using SerializationState = IDataType::SerializeBinaryBulkStatePtr;
using SerializationStates = std::vector<SerializationState>;
/// Output state for one column stream: a data file and a marks file, each
/// wrapped in hashing buffers whose counters are later copied into the part
/// checksums by addToChecksums().
struct ColumnStream
{
/// Opens both files; the data file name is data_path_ + data_file_extension_,
/// the marks file name is marks_path_ + marks_file_extension_.
ColumnStream(
const String & escaped_column_name_,
const String & data_path_,
const std::string & data_file_extension_,
const std::string & marks_path_,
const std::string & marks_file_extension_,
const CompressionCodecPtr & compression_codec_,
size_t max_compress_block_size_,
size_t estimated_size_,
size_t aio_threshold_);
/// Prefix of the keys written into the checksums map.
String escaped_column_name;
/// The two extensions are declared before the buffers because the constructor
/// builds the file names from these members; keep this order.
std::string data_file_extension;
std::string marks_file_extension;
/// compressed -> compressed_buf -> plain_hashing -> plain_file
std::unique_ptr<WriteBufferFromFileBase> plain_file;
HashingWriteBuffer plain_hashing;
CompressedWriteBuffer compressed_buf;
HashingWriteBuffer compressed;
/// marks -> marks_file
WriteBufferFromFile marks_file;
HashingWriteBuffer marks;
/// Calls next() on compressed, plain_file and marks, in that order.
void finalize();
/// Calls sync() on plain_file and marks_file.
void sync();
/// Writes the sizes and hashes of the data and marks files into `checksums`.
void addToChecksums(MergeTreeData::DataPart::Checksums & checksums);
};
using ColumnStreams = std::map<String, std::unique_ptr<ColumnStream>>;
/// Creates a ColumnStream in `column_streams` for every substream of `type`
/// that does not have one yet. Files are placed at `path` + stream file name.
/// When `skip_offsets` is set, ArraySizes substreams are not created.
void addStreams(const String & path, const String & name, const IDataType & type,
const CompressionCodecPtr & codec, size_t estimated_size, bool skip_offsets);
IDataType::OutputStreamGetter createStreamGetter(const String & name, WrittenOffsetColumns & offset_columns, bool skip_offsets);
/// Write data of one column.
/// Return how many marks were written and
/// how many rows were written for last mark
std::pair<size_t, size_t> writeColumn(
const String & name,
const IDataType & type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
bool skip_offsets,
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
size_t from_mark
);
/// Write single granule of one column (rows between 2 marks)
size_t writeSingleGranule(
const String & name,
const IDataType & type,
const IColumn & column,
WrittenOffsetColumns & offset_columns,
bool skip_offsets,
IDataType::SerializeBinaryBulkStatePtr & serialization_state,
IDataType::SerializeBinaryBulkSettings & serialize_settings,
size_t from_row,
size_t number_of_rows,
bool write_marks);
/// Write mark for column.
/// For every substream of `type` the mark is written to that stream's marks
/// buffer; `number_of_rows` is included only with adaptive granularity.
/// ArraySizes substreams are skipped when `skip_offsets` is set or when their
/// stream name is already in `offset_columns`.
void writeSingleMark(
const String & name,
const IDataType & type,
WrittenOffsetColumns & offset_columns,
bool skip_offsets,
size_t number_of_rows,
DB::IDataType::SubstreamPath & path);
/// Count index_granularity for block and store in `index_granularity`
void fillIndexGranularity(const Block & block);
@ -130,15 +53,10 @@ protected:
SerializationStates serialization_states;
String part_path;
ColumnStreams column_streams;
/// The offset to the first row of the block for which you want to write the index.
size_t index_offset = 0;
size_t min_compress_block_size;
size_t max_compress_block_size;
size_t aio_threshold;
WriterSettings writer_settings;
size_t current_mark = 0;
@ -156,10 +74,12 @@ protected:
CompressionCodecPtr codec;
std::vector<MergeTreeIndexPtr> skip_indices;
std::vector<std::unique_ptr<ColumnStream>> skip_indices_streams;
std::vector<std::unique_ptr<IMergeTreeDataPartWriter::ColumnStream>> skip_indices_streams;
MergeTreeIndexAggregators skip_indices_aggregators;
std::vector<size_t> skip_index_filling;
std::unique_ptr<IMergeTreeDataPartWriter> writer;
const bool with_final_mark;
};

View File

@ -9,5 +9,16 @@ namespace DB
size_t max_read_buffer_size = 0;
bool save_marks_in_cache = false;
};
<<<<<<< HEAD
=======
struct WriterSettings
{
size_t min_compress_block_size;
size_t max_compress_block_size;
size_t aio_threshold;
String marks_file_extension;
};
>>>>>>> 03dc18db16... tmp
}

View File

@ -105,7 +105,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
{
/// Finish columns serialization.
{
auto & settings = storage.global_context.getSettingsRef();
const auto & settings = storage.global_context.getSettingsRef();
IDataType::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.low_cardinality_max_dictionary_size = settings.low_cardinality_max_dictionary_size;
serialize_settings.low_cardinality_use_single_dictionary_for_part = settings.low_cardinality_use_single_dictionary_for_part != 0;
@ -242,8 +242,8 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
if (compute_granularity)
fillIndexGranularity(block);
/// The set of written offset columns so that you do not write shared offsets of nested structures columns several times
WrittenOffsetColumns offset_columns;
Block primary_key_block;
Block skip_indexes_block;
Block primary_key_block;
Block skip_indexes_block;
@ -293,12 +293,9 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
}
}
if (serialization_states.empty())
{
serialization_states.reserve(columns_list.size());
WrittenOffsetColumns tmp_offset_columns;
IDataType::SerializeBinaryBulkSettings settings;
size_t new_index_offset = writer->write(block, primary_key_block, skip_indexes_block, current_mark, index_offset);
<<<<<<< HEAD
for (const auto & col : columns_list)
{
settings.getter = createStreamGetter(col.name, tmp_offset_columns, false);
@ -340,6 +337,8 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
std::tie(std::ignore, new_index_offset) = writeColumn(column.name, *column.type, *column.column, offset_columns, false, serialization_states[i], current_mark);
}
}
=======
>>>>>>> 03dc18db16... tmp
std::cerr << "(MergedBlockOutputStream::writeImpl) new_index_offset: " << new_index_offset << "\n";

View File

@ -14,14 +14,14 @@ class MergedBlockOutputStream final : public IMergedBlockOutputStream
{
public:
MergedBlockOutputStream(
MergeTreeData & storage_,
const MergeTreeData & storage_,
const String & part_path_,
const NamesAndTypesList & columns_list_,
CompressionCodecPtr default_codec_,
bool blocks_are_granules_size_ = false);
MergedBlockOutputStream(
MergeTreeData & storage_,
const MergeTreeData & storage_,
const String & part_path_,
const NamesAndTypesList & columns_list_,
CompressionCodecPtr default_codec_,

View File

@ -4,7 +4,7 @@ namespace DB
{
MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
MergeTreeData & storage_, const Block & header_, const String & part_path_, bool sync_,
const MergeTreeData & storage_, const Block & header_, const String & part_path_, bool sync_,
CompressionCodecPtr default_codec_, bool skip_offsets_,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc_,
WrittenOffsetColumns & already_written_offset_columns_,